/*
 * linux/ipc/shm.c
 * Copyright (C) 1992, 1993 Krishna Balasubramanian
 *	 Many improvements/fixes by Bruno Haible.
 * Replaced `struct shm_desc' by `struct vm_area_struct', July 1994.
 * Fixed the shm swap deallocation (shm_unuse()), August 1998 Andrea Arcangeli.
 *
 * /proc/sysvipc/shm support (c) 1999 Dragos Acostachioaie <dragos@iname.com>
 * BIGMEM support, Andrea Arcangeli <andrea@suse.de>
 * SMP thread shm, Jean-Luc Boyard <jean-luc.boyard@siemens.fr>
 * HIGHMEM support, Ingo Molnar <mingo@redhat.com>
 * avoid vmalloc and make shmmax, shmall, shmmni sysctl'able,
 *	Christoph Rohland <hans-christoph.rohland@sap.com>
 * Shared /dev/zero support, Kanoj Sarcar <kanoj@sgi.com>
 * make it a file system, Christoph Rohland <hans-christoph.rohland@sap.com>
 *
 * The filesystem has the following restrictions/bugs:
 * 1) It can only handle one directory.
 * 2) Because the directory is represented by the SYSV shm array it
 *    can only be mounted once.
 * 3) Private writeable mappings are not supported.
 * 4) Read and write are not implemented (should they be?)
 * 5) No special nodes are supported.
 *
 * There are the following mount options:
 * - nr_blocks (^= shmall) is the number of blocks of size PAGE_SIZE
 *   we are allowed to allocate
 * - nr_inodes (^= shmmni) is the number of files we are allowed to
 *   allocate
 * - mode is the mode for the root directory (default S_IRWXUGO | S_ISVTX)
 */
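/*
 * An illustrative mount invocation (not from this file; the mount point
 * and option values are arbitrary):
 *
 *	mount -t shm none /var/shm -o nr_blocks=4096,nr_inodes=128,mode=700
 *
 * This would cap the filesystem at 4096 PAGE_SIZE blocks across at most
 * 128 segments, with a root directory accessible only to its owner.
 */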
#include <linux/config.h>
#include <linux/module.h>
#include <linux/malloc.h>
#include <linux/shm.h>
#include <linux/swap.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/locks.h>
#include <linux/file.h>
#include <linux/mman.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/proc_fs.h>
#include <linux/highmem.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>

#include "util.h"
static struct super_block *shm_read_super(struct super_block *, void *, int);
static void shm_put_super (struct super_block *);
static int shm_remount_fs (struct super_block *, int *, char *);
static void shm_read_inode (struct inode *);
static void shm_write_inode(struct inode *);
static int shm_statfs (struct super_block *, struct statfs *);
static int shm_create (struct inode *, struct dentry *, int);
static struct dentry *shm_lookup (struct inode *, struct dentry *);
static int shm_unlink (struct inode *, struct dentry *);
static int shm_setattr (struct dentry *dent, struct iattr *attr);
static void shm_delete (struct inode *);
static int shm_mmap (struct file *, struct vm_area_struct *);
static int shm_readdir (struct file *, void *, filldir_t);
#define SHM_NAME_LEN NAME_MAX
#define SHM_FMT ".IPC_%08x"
#define SHM_FMT_LEN 13
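/* A SYSV segment with id 0x0000abcd is thus backed by a file named
 * ".IPC_0000abcd": SHM_FMT_LEN is the 5 characters of ".IPC_" plus
 * 8 hex digits. */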
/* shm_mode upper byte flags */
/* SHM_DEST and SHM_LOCKED are used in ipcs(8) */
#define PRV_DEST	0010000	/* segment will be destroyed on last detach */
#define PRV_LOCKED	0020000	/* segment will not be swapped */
#define SHM_UNLK	0040000	/* filename is unlinked */
#define SHM_SYSV	0100000	/* it is a SYSV shm segment */
struct shmid_kernel /* private to the kernel */
{
        struct kern_ipc_perm	shm_perm;
        size_t			shm_segsz;
        unsigned long		shm_nattch;
        unsigned long		shm_npages;	/* size of segment (pages) */
        pte_t			**shm_dir;	/* ptr to arr of ptrs to frames */
        int			id;
        union permap {
                struct shmem {
                        time_t	atime;
                        time_t	dtime;
                        time_t	ctime;
                        pid_t	cpid;
                        pid_t	lpid;
                        int	nlen;
                        char	nm[0];
                } shmem;
                struct zero {
                        struct semaphore	sema;
                        struct list_head	list;
                } zero;
        } permap;
};
#define shm_atim	permap.shmem.atime
#define shm_dtim	permap.shmem.dtime
#define shm_ctim	permap.shmem.ctime
#define shm_cprid	permap.shmem.cpid
#define shm_lprid	permap.shmem.lpid
#define shm_namelen	permap.shmem.nlen
#define shm_name	permap.shmem.nm
#define shm_flags	shm_perm.mode
#define zsem		permap.zero.sema
#define zero_list	permap.zero.list
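/* Note that shm_flags above is a plain object-like macro, not a struct
 * member: any `x.shm_flags' in this file (including on a struct
 * shmid64_ds or shmid_ds, as in copy_shmid_from_user() below) expands
 * to `x.shm_perm.mode'. */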
static struct ipc_ids shm_ids;

#define shm_lock(id)	((struct shmid_kernel *)ipc_lock(&shm_ids, id))
#define shm_unlock(id)	ipc_unlock(&shm_ids, id)
#define shm_lockall()	ipc_lockall(&shm_ids)
#define shm_unlockall()	ipc_unlockall(&shm_ids)
#define shm_get(id)	((struct shmid_kernel *)ipc_get(&shm_ids, id))
#define shm_buildid(id, seq) \
	ipc_buildid(&shm_ids, id, seq)
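/* ipc_buildid() lives in ipc/util.h; at this point in history it combines
 * the array slot with the slot's sequence counter (roughly
 * seq * SEQ_MULTIPLIER + id), so a recycled slot yields a fresh shmid and
 * stale ids are caught by shm_checkid()/ipc_checkid(). */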
static int newseg (key_t key, const char *name, int namelen, int shmflg, size_t size);
static void seg_free(struct shmid_kernel *shp, int doacc);
static void shm_open (struct vm_area_struct *shmd);
static void shm_close (struct vm_area_struct *shmd);
static int shm_remove_name(int id);
static struct page * shm_nopage(struct vm_area_struct *, unsigned long, int);
static int shm_swapout(struct page *, struct file *);
#ifdef CONFIG_PROC_FS
static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data);
#endif

static void zshm_swap (int prio, int gfp_mask);
static void zmap_unuse(swp_entry_t entry, struct page *page);
static void shmzero_open(struct vm_area_struct *shmd);
static void shmzero_close(struct vm_area_struct *shmd);
static struct page *shmzero_nopage(struct vm_area_struct * shmd, unsigned long address, int no_share);
static int zero_id;
static struct shmid_kernel zshmid_kernel;
static struct dentry *zdent;
#define SHM_FS_MAGIC 0x02011994

static struct super_block * shm_sb;

static DECLARE_FSTYPE(shm_fs_type, "shm", shm_read_super, FS_SINGLE);
static struct super_operations shm_sops = {
	read_inode:	shm_read_inode,
	write_inode:	shm_write_inode,
	delete_inode:	shm_delete,
	put_super:	shm_put_super,
	statfs:		shm_statfs,
	remount_fs:	shm_remount_fs,
};

static struct file_operations shm_root_operations = {
	readdir:	shm_readdir,
};

static struct inode_operations shm_root_inode_operations = {
	create:		shm_create,
	lookup:		shm_lookup,
	unlink:		shm_unlink,
};

static struct file_operations shm_file_operations = {
	mmap:		shm_mmap,
};

static struct inode_operations shm_inode_operations = {
	setattr:	shm_setattr,
};

static struct vm_operations_struct shm_vm_ops = {
	open:	shm_open,	/* callback for a new vm-area open */
	close:	shm_close,	/* callback for when the vm-area is released */
	nopage:	shm_nopage,
	swapout: shm_swapout,
};
size_t shm_ctlmax = SHMMAX;

/* These parameters should be part of the superblock */
static int shm_ctlall;
static int shm_ctlmni;
static int shm_mode;

static int shm_tot = 0; /* total number of shared memory pages */
static int shm_rss = 0; /* number of shared memory pages that are in memory */
static int shm_swp = 0; /* number of shared memory pages that are in swap */

/* locks order:
	pagecache_lock
	shm_lock()/shm_lockall()
	kernel lock
	inode->i_sem
	sem_ids.sem
	mmap_sem

   SMP assumptions:
   - swap_free() never sleeps
   - add_to_swap_cache() never sleeps
   - add_to_swap_cache() doesn't acquire the big kernel lock.
   - shm_unuse() is called with the kernel lock acquired.
 */

/* some statistics */
static ulong swap_attempts = 0;
static ulong swap_successes = 0;
static ulong used_segs = 0;
void __init shm_init (void)
{
	struct vfsmount *res;
	ipc_init_ids(&shm_ids, 1);

	register_filesystem (&shm_fs_type);
	res = kern_mount(&shm_fs_type);
	if (IS_ERR(res)) {
		unregister_filesystem(&shm_fs_type);
		return;
	}
#ifdef CONFIG_PROC_FS
	create_proc_read_entry("sysvipc/shm", 0, 0, sysvipc_shm_read_proc, NULL);
#endif
	zero_id = ipc_addid(&shm_ids, &zshmid_kernel.shm_perm, 1);
	shm_unlock(zero_id);
	INIT_LIST_HEAD(&zshmid_kernel.zero_list);
	zdent = d_alloc_root(get_empty_inode());
	return;
}
static int shm_parse_options(char *options)
{
	int blocks = shm_ctlall;
	int inodes = shm_ctlmni;
	umode_t mode = shm_mode;
	char *this_char, *value;

	this_char = NULL;
	if (options)
		this_char = strtok(options, ",");
	for ( ; this_char; this_char = strtok(NULL, ",")) {
		if ((value = strchr(this_char, '=')) != NULL)
			*value++ = 0;
		if (!strcmp(this_char, "nr_blocks")) {
			if (!value || !*value)
				return 1;
			blocks = simple_strtoul(value, &value, 0);
			if (*value)
				return 1;
		}
		else if (!strcmp(this_char, "nr_inodes")) {
			if (!value || !*value)
				return 1;
			inodes = simple_strtoul(value, &value, 0);
			if (*value)
				return 1;
		}
		else if (!strcmp(this_char, "mode")) {
			if (!value || !*value)
				return 1;
			mode = simple_strtoul(value, &value, 8);
			if (*value)
				return 1;
		}
		else
			return 1;
	}
	shm_ctlmni = inodes;
	shm_ctlall = blocks;
	shm_mode = mode;

	return 0;
}
static struct super_block *shm_read_super(struct super_block *s, void *data,
					  int silent)
{
	struct inode * root_inode;

	shm_ctlall = SHMALL;
	shm_ctlmni = SHMMNI;
	shm_mode   = S_IRWXUGO | S_ISVTX;
	if (shm_parse_options (data)) {
		printk(KERN_ERR "shm fs invalid option\n");
		goto out_unlock;
	}

	s->s_blocksize = PAGE_SIZE;
	s->s_blocksize_bits = PAGE_SHIFT;
	s->s_magic = SHM_FS_MAGIC;
	s->s_op = &shm_sops;
	root_inode = iget (s, SEQ_MULTIPLIER);
	if (!root_inode)
		goto out_no_root;
	root_inode->i_op = &shm_root_inode_operations;
	root_inode->i_sb = s;
	root_inode->i_nlink = 2;
	root_inode->i_mode = S_IFDIR | shm_mode;
	s->s_root = d_alloc_root(root_inode);
	if (!s->s_root)
		goto out_no_root;
	shm_sb = s;
	return s;

out_no_root:
	printk(KERN_ERR "shm_read_super: get root inode failed\n");
	iput(root_inode);
out_unlock:
	return NULL;
}
static int shm_remount_fs (struct super_block *sb, int *flags, char *data)
{
	if (shm_parse_options (data))
		return -EINVAL;
	return 0;
}

static inline int shm_checkid(struct shmid_kernel *s, int id)
{
	if (!(s->shm_flags & SHM_SYSV))
		return -EINVAL;
	if (ipc_checkid(&shm_ids, &s->shm_perm, id))
		return -EIDRM;
	return 0;
}

static inline struct shmid_kernel *shm_rmid(int id)
{
	return (struct shmid_kernel *)ipc_rmid(&shm_ids, id);
}

static __inline__ int shm_addid(struct shmid_kernel *shp)
{
	return ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni + 1);
}
static void shm_put_super(struct super_block *sb)
{
	int i;
	struct shmid_kernel *shp;

	down(&shm_ids.sem);
	for (i = 0; i <= shm_ids.max_id; i++) {
		if (i == zero_id)
			continue;
		if (!(shp = shm_lock (i)))
			continue;
		if (shp->shm_nattch)
			printk(KERN_DEBUG "shm_nattch = %ld\n", shp->shm_nattch);
		shp = shm_rmid(i);
		shm_unlock(i);
		seg_free(shp, 1);
	}
	dput (sb->s_root);
	up(&shm_ids.sem);
}

static int shm_statfs(struct super_block *sb, struct statfs *buf)
{
	buf->f_type = SHM_FS_MAGIC;
	buf->f_bsize = PAGE_SIZE;
	buf->f_blocks = shm_ctlall;
	buf->f_bavail = buf->f_bfree = shm_ctlall - shm_tot;
	buf->f_files = shm_ctlmni;
	buf->f_ffree = shm_ctlmni - used_segs;
	buf->f_namelen = SHM_NAME_LEN;
	return 0;
}
static void shm_write_inode(struct inode * inode)
{
}

static void shm_read_inode(struct inode * inode)
{
	int id;
	struct shmid_kernel *shp;

	id = inode->i_ino;
	inode->i_op = NULL;
	inode->i_mode = 0;
	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;

	if (id < SEQ_MULTIPLIER) {
		if (!(shp = shm_lock (id)))
			return;
		inode->i_mode = (shp->shm_flags & S_IALLUGO) | S_IFREG;
		inode->i_uid  = shp->shm_perm.uid;
		inode->i_gid  = shp->shm_perm.gid;
		inode->i_size = shp->shm_segsz;
		shm_unlock (id);
		inode->i_op  = &shm_inode_operations;
		inode->i_fop = &shm_file_operations;
		return;
	}
	inode->i_op    = &shm_root_inode_operations;
	inode->i_fop   = &shm_root_operations;
	inode->i_sb    = shm_sb;
	inode->i_nlink = 2;
	inode->i_mode  = S_IFDIR | shm_mode;
	inode->i_uid   = inode->i_gid = 0;
}
static int shm_create (struct inode *dir, struct dentry *dent, int mode)
{
	int id, err;
	struct inode * inode;

	down(&shm_ids.sem);
	err = id = newseg (IPC_PRIVATE, dent->d_name.name, dent->d_name.len, mode, 0);
	if (err < 0)
		goto out;

	err = -ENOMEM;
	inode = iget (shm_sb, id % SEQ_MULTIPLIER);
	if (!inode)
		goto out;

	err = 0;
	down (&inode->i_sem);
	inode->i_mode = mode | S_IFREG;
	inode->i_op   = &shm_inode_operations;
	d_instantiate(dent, inode);
	up (&inode->i_sem);

out:
	up(&shm_ids.sem);
	return err;
}
static int shm_readdir (struct file *filp, void *dirent, filldir_t filldir)
{
	struct inode * inode = filp->f_dentry->d_inode;
	struct shmid_kernel *shp;
	off_t nr;

	nr = filp->f_pos;

	switch(nr)
	{
	case 0:
		if (filldir(dirent, ".", 1, nr, inode->i_ino) < 0)
			return 0;
		filp->f_pos = ++nr;
		/* fall through */
	case 1:
		if (filldir(dirent, "..", 2, nr, inode->i_ino) < 0)
			return 0;
		filp->f_pos = ++nr;
		/* fall through */
	default:
		down(&shm_ids.sem);
		for (; nr-2 <= shm_ids.max_id; nr++) {
			if (nr-2 == zero_id)
				continue;
			if (!(shp = shm_get (nr-2)))
				continue;
			if (shp->shm_flags & SHM_UNLK)
				continue;
			if (filldir(dirent, shp->shm_name, shp->shm_namelen, nr, nr) < 0)
				break;
		}
		filp->f_pos = nr;
		up(&shm_ids.sem);
		break;
	}

	UPDATE_ATIME(inode);
	return 0;
}
static struct dentry *shm_lookup (struct inode *dir, struct dentry *dent)
{
	int i, err = 0;
	struct shmid_kernel* shp;
	struct inode *inode = NULL;

	if (dent->d_name.len > SHM_NAME_LEN)
		return ERR_PTR(-ENAMETOOLONG);

	down(&shm_ids.sem);
	for (i = 0; i <= shm_ids.max_id; i++) {
		if (i == zero_id)
			continue;
		if (!(shp = shm_lock(i)))
			continue;
		if (!(shp->shm_flags & SHM_UNLK) &&
		    dent->d_name.len == shp->shm_namelen &&
		    strncmp(dent->d_name.name, shp->shm_name, shp->shm_namelen) == 0)
			goto found;
		shm_unlock(i);
	}

	/*
	 * Prevent the reserved names from becoming negative dentries.
	 * This also prevents object creation through the filesystem.
	 */
	if (dent->d_name.len == SHM_FMT_LEN &&
	    memcmp (SHM_FMT, dent->d_name.name, SHM_FMT_LEN - 8) == 0)
		err = -EINVAL;	/* EINVAL to give IPC_RMID the right error */

	goto out;

found:
	shm_unlock(i);
	inode = iget(dir->i_sb, i);

	if (!inode)
		err = -EACCES;
out:
	if (err == 0)
		d_add (dent, inode);
	up (&shm_ids.sem);
	return ERR_PTR(err);
}
static int shm_unlink (struct inode *dir, struct dentry *dent)
{
	struct inode * inode = dent->d_inode;
	struct shmid_kernel *shp;

	down (&shm_ids.sem);
	if (!(shp = shm_lock (inode->i_ino)))
		BUG();
	shp->shm_flags |= SHM_UNLK | PRV_DEST;
	shp->shm_perm.key = IPC_PRIVATE; /* Do not find it any more */
	shm_unlock (inode->i_ino);
	up (&shm_ids.sem);
	inode->i_nlink -= 1;
	/*
	 * If it's a reserved name we have to drop the dentry instead
	 * of creating a negative dentry.
	 */
	if (dent->d_name.len == SHM_FMT_LEN &&
	    memcmp (SHM_FMT, dent->d_name.name, SHM_FMT_LEN - 8) == 0)
		d_drop (dent);
	else
		d_delete (dent);
	return 0;
}
#define SHM_ENTRY(shp, index) (shp)->shm_dir[(index)/PTRS_PER_PTE][(index)%PTRS_PER_PTE]
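/* shm_dir is a two-level table modelled on the page tables. For example,
 * with PTRS_PER_PTE == 1024 (as on i386), page 3000 of a segment lives
 * in directory slot 3000/1024 = 2, entry 3000%1024 = 952. */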
static pte_t **shm_alloc(unsigned long pages, int doacc)
{
	unsigned short dir  = pages / PTRS_PER_PTE;
	unsigned short last = pages % PTRS_PER_PTE;
	pte_t **ret, **ptr, *pte;

	if (pages == 0)
		return NULL;

	ret = kmalloc ((dir+1) * sizeof(pte_t *), GFP_KERNEL);
	if (!ret)
		goto nomem;

	for (ptr = ret; ptr < ret+dir; ptr++)
	{
		*ptr = (pte_t *)__get_free_page (GFP_KERNEL);
		if (!*ptr)
			goto free;
		for (pte = *ptr; pte < *ptr + PTRS_PER_PTE; pte++)
			pte_clear (pte);
	}

	/* The last one is probably not of PAGE_SIZE: we use kmalloc */
	if (last) {
		*ptr = kmalloc (last*sizeof(pte_t), GFP_KERNEL);
		if (!*ptr)
			goto free;
		for (pte = *ptr; pte < *ptr + last; pte++)
			pte_clear (pte);
	}
	if (doacc) {
		shm_lockall();
		shm_tot += pages;
		used_segs++;
		shm_unlockall();
	}
	return ret;

free:
	/* The last one failed: we decrement first */
	while (--ptr >= ret)
		free_page ((unsigned long)*ptr);

	kfree (ret);
nomem:
	return ERR_PTR(-ENOMEM);
}
static void shm_free(pte_t** dir, unsigned long pages, int doacc)
{
	int i, rss, swp;
	pte_t **ptr = dir + pages/PTRS_PER_PTE;

	if (!dir)
		return;

	for (i = 0, rss = 0, swp = 0; i < pages; i++) {
		pte_t pte;
		pte = dir[i/PTRS_PER_PTE][i%PTRS_PER_PTE];
		if (pte_none(pte))
			continue;
		if (pte_present(pte)) {
			__free_page (pte_page(pte));
			rss++;
		} else {
			swap_free(pte_to_swp_entry(pte));
			swp++;
		}
	}

	/* first the last page */
	if (pages%PTRS_PER_PTE)
		kfree (*ptr);
	/* now the whole pages */
	while (--ptr >= dir)
		if (*ptr)
			free_page ((unsigned long)*ptr);

	/* Now the indirect block */
	kfree (dir);

	if (doacc) {
		shm_lockall();
		shm_rss -= rss;
		shm_swp -= swp;
		shm_tot -= pages;
		used_segs--;
		shm_unlockall();
	}
}
static int shm_setattr (struct dentry *dentry, struct iattr *attr)
{
	int error;
	struct inode *inode = dentry->d_inode;
	struct shmid_kernel *shp;
	unsigned long new_pages, old_pages;
	pte_t **new_dir, **old_dir;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;
	if (!(attr->ia_valid & ATTR_SIZE))
		goto set_attr;
	if (attr->ia_size > shm_ctlmax)
		return -EFBIG;

	/* We set old_pages and old_dir for easier cleanup */
	old_pages = new_pages = (attr->ia_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	old_dir = new_dir = shm_alloc(new_pages, 1);
	if (IS_ERR(new_dir))
		return PTR_ERR(new_dir);

	if (!(shp = shm_lock(inode->i_ino)))
		BUG();
	error = -ENOSPC;
	if (shm_tot - shp->shm_npages >= shm_ctlall)
		goto out;
	error = 0;
	if (shp->shm_segsz == attr->ia_size)
		goto out;
	/* Now we set them to the real values */
	old_dir = shp->shm_dir;
	old_pages = shp->shm_npages;
	if (old_dir) {
		pte_t *swap;
		int i, j;
		i = old_pages < new_pages ? old_pages : new_pages;
		j = i % PTRS_PER_PTE;
		i /= PTRS_PER_PTE;
		if (j)
			memcpy (new_dir[i], old_dir[i], j * sizeof (pte_t));
		while (i--) {
			swap = new_dir[i];
			new_dir[i] = old_dir[i];
			old_dir[i] = swap;
		}
	}
	shp->shm_dir = new_dir;
	shp->shm_npages = new_pages;
	shp->shm_segsz = attr->ia_size;
out:
	shm_unlock(inode->i_ino);
	shm_free (old_dir, old_pages, 1);
set_attr:
	inode_setattr(inode, attr);
	return error;
}
static struct shmid_kernel *seg_alloc(int numpages, size_t namelen)
{
	struct shmid_kernel *shp;
	pte_t **dir;

	shp = (struct shmid_kernel *) kmalloc (sizeof (*shp) + namelen, GFP_KERNEL);
	if (!shp)
		return ERR_PTR(-ENOMEM);

	/* namelen doubles as shm_alloc()'s accounting flag: it is non-zero
	 * for SYSV and filesystem segments, and zero for the unaccounted
	 * /dev/zero mappings, which pair this with seg_free(shp, 0). */
	dir = shm_alloc (numpages, namelen);
	if (IS_ERR(dir)) {
		kfree(shp);
		return ERR_PTR(PTR_ERR(dir));
	}
	shp->shm_dir	 = dir;
	shp->shm_npages	 = numpages;
	shp->shm_nattch	 = 0;
	shp->shm_namelen = namelen;
	return(shp);
}

static void seg_free(struct shmid_kernel *shp, int doacc)
{
	shm_free (shp->shm_dir, shp->shm_npages, doacc);
	kfree(shp);
}
static int newseg (key_t key, const char *name, int namelen,
		   int shmflg, size_t size)
{
	struct shmid_kernel *shp;
	int numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	int id;

	if (namelen > SHM_NAME_LEN)
		return -ENAMETOOLONG;

	if (size > shm_ctlmax)
		return -EINVAL;

	if (shm_tot + numpages >= shm_ctlall)
		return -ENOSPC;

	/* seg_alloc() reports failure with ERR_PTR(), never NULL */
	shp = seg_alloc(numpages, namelen ? namelen : SHM_FMT_LEN + 1);
	if (IS_ERR(shp))
		return PTR_ERR(shp);
	id = shm_addid(shp);
	if (id == -1) {
		seg_free(shp, 1);
		return -ENOSPC;
	}
	shp->shm_perm.key = key;
	shp->shm_flags = (shmflg & S_IRWXUGO);
	shp->shm_segsz = size;
	shp->shm_cprid = current->pid;
	shp->shm_lprid = 0;
	shp->shm_atim = shp->shm_dtim = 0;
	shp->shm_ctim = CURRENT_TIME;
	shp->id = shm_buildid(id, shp->shm_perm.seq);
	if (namelen != 0) {
		shp->shm_namelen = namelen;
		memcpy (shp->shm_name, name, namelen);
	} else {
		shp->shm_flags |= SHM_SYSV;
		shp->shm_namelen = sprintf (shp->shm_name, SHM_FMT, shp->id);
	}
	shm_unlock(id);

	return shp->id;
}
asmlinkage long sys_shmget (key_t key, size_t size, int shmflg)
{
	struct shmid_kernel *shp;
	int err, id = 0;

	if (size < SHMMIN)
		return -EINVAL;

	down(&shm_ids.sem);
	if (key == IPC_PRIVATE) {
		err = newseg(key, NULL, 0, shmflg, size);
	} else if ((id = ipc_findkey(&shm_ids, key)) == -1) {
		if (!(shmflg & IPC_CREAT))
			err = -ENOENT;
		else
			err = newseg(key, NULL, 0, shmflg, size);
	} else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) {
		err = -EEXIST;
	} else {
		shp = shm_lock(id);
		if (shp == NULL)
			BUG();
		if (shp->shm_segsz < size)
			err = -EINVAL;
		else if (ipcperms(&shp->shm_perm, shmflg))
			err = -EACCES;
		else
			err = shm_buildid(id, shp->shm_perm.seq);
		shm_unlock(id);
	}
	up(&shm_ids.sem);
	return err;
}
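/*
 * The user-space view of the syscalls implemented in this file, as an
 * illustrative sketch (not part of the kernel build; error handling
 * omitted):
 *
 *	int id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
 *	char *p = shmat(id, NULL, 0);	- attaches via sys_shmat()
 *	p[0] = 42;			- faults a page in via shm_nopage()
 *	shmdt(p);			- detaches via sys_shmdt()
 *	shmctl(id, IPC_RMID, NULL);	- marks the segment for destruction
 */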
static void shm_delete (struct inode *ino)
{
	int shmid = ino->i_ino;
	struct shmid_kernel *shp;

	down(&shm_ids.sem);
	shp = shm_lock(shmid);
	if (shp == NULL)
		BUG();
	shp = shm_rmid(shmid);
	shm_unlock(shmid);
	up(&shm_ids.sem);
	seg_free(shp, 1);
	clear_inode(ino);
}
static inline unsigned long copy_shmid_to_user(void *buf, struct shmid64_ds *in, int version)
{
	switch(version) {
	case IPC_64:
		return copy_to_user(buf, in, sizeof(*in));
	case IPC_OLD:
	    {
		struct shmid_ds out;

		ipc64_perm_to_ipc_perm(&in->shm_perm, &out.shm_perm);
		out.shm_segsz	= in->shm_segsz;
		out.shm_atime	= in->shm_atime;
		out.shm_dtime	= in->shm_dtime;
		out.shm_ctime	= in->shm_ctime;
		out.shm_cpid	= in->shm_cpid;
		out.shm_lpid	= in->shm_lpid;
		out.shm_nattch	= in->shm_nattch;

		return copy_to_user(buf, &out, sizeof(out));
	    }
	default:
		return -EINVAL;
	}
}
struct shm_setbuf {
	uid_t	uid;
	gid_t	gid;
	mode_t	mode;
};

static inline unsigned long copy_shmid_from_user(struct shm_setbuf *out, void *buf, int version)
{
	switch(version) {
	case IPC_64:
	    {
		struct shmid64_ds tbuf;

		if (copy_from_user(&tbuf, buf, sizeof(tbuf)))
			return -EFAULT;

		out->uid	= tbuf.shm_perm.uid;
		out->gid	= tbuf.shm_perm.gid;
		out->mode	= tbuf.shm_flags;

		return 0;
	    }
	case IPC_OLD:
	    {
		struct shmid_ds tbuf_old;

		if (copy_from_user(&tbuf_old, buf, sizeof(tbuf_old)))
			return -EFAULT;

		out->uid	= tbuf_old.shm_perm.uid;
		out->gid	= tbuf_old.shm_perm.gid;
		out->mode	= tbuf_old.shm_flags;

		return 0;
	    }
	default:
		return -EINVAL;
	}
}
static inline unsigned long copy_shminfo_to_user(void *buf, struct shminfo64 *in, int version)
{
	switch(version) {
	case IPC_64:
		return copy_to_user(buf, in, sizeof(*in));
	case IPC_OLD:
	    {
		struct shminfo out;

		if (in->shmmax > INT_MAX)
			out.shmmax = INT_MAX;
		else
			out.shmmax = (int)in->shmmax;

		out.shmmin	= in->shmmin;
		out.shmmni	= in->shmmni;
		out.shmseg	= in->shmseg;
		out.shmall	= in->shmall;

		return copy_to_user(buf, &out, sizeof(out));
	    }
	default:
		return -EINVAL;
	}
}
asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds *buf)
{
	struct shm_setbuf setbuf;
	struct shmid_kernel *shp;
	int err, version;

	if (cmd < 0 || shmid < 0)
		return -EINVAL;

	version = ipc_parse_version(&cmd);

	switch (cmd) { /* replace with proc interface ? */
	case IPC_INFO:
	{
		struct shminfo64 shminfo;

		memset(&shminfo, 0, sizeof(shminfo));
		shminfo.shmmni = shminfo.shmseg = shm_ctlmni;
		shminfo.shmmax = shm_ctlmax;
		shminfo.shmall = shm_ctlall;

		shminfo.shmmin = SHMMIN;
		if (copy_shminfo_to_user (buf, &shminfo, version))
			return -EFAULT;
		/* reading an integer is always atomic */
		err = shm_ids.max_id;
		if (err < 0)
			err = 0;
		return err;
	}
	case SHM_INFO:
	{
		struct shm_info shm_info;

		memset(&shm_info, 0, sizeof(shm_info));
		shm_lockall();
		shm_info.used_ids = shm_ids.in_use;
		shm_info.shm_rss = shm_rss;
		shm_info.shm_tot = shm_tot;
		shm_info.shm_swp = shm_swp;
		shm_info.swap_attempts = swap_attempts;
		shm_info.swap_successes = swap_successes;
		err = shm_ids.max_id;
		shm_unlockall();
		if (copy_to_user (buf, &shm_info, sizeof(shm_info)))
			return -EFAULT;

		return err < 0 ? 0 : err;
	}
	case SHM_STAT:
	case IPC_STAT:
	{
		struct shmid64_ds tbuf;
		int result;
		if ((shmid % SEQ_MULTIPLIER) == zero_id)
			return -EINVAL;
		memset(&tbuf, 0, sizeof(tbuf));
		shp = shm_lock(shmid);
		if (shp == NULL)
			return -EINVAL;
		if (cmd == SHM_STAT) {
			err = -EINVAL;
			if (!(shp->shm_flags & SHM_SYSV) ||
			    shmid > shm_ids.max_id)
				goto out_unlock;
			result = shm_buildid(shmid, shp->shm_perm.seq);
		} else {
			err = shm_checkid(shp, shmid);
			if (err)
				goto out_unlock;
			result = 0;
		}
		err = -EACCES;
		if (ipcperms (&shp->shm_perm, S_IRUGO))
			goto out_unlock;
		kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm);
		/* ugly hack to keep binary compatibility for ipcs */
		tbuf.shm_flags &= PRV_DEST | PRV_LOCKED | S_IRWXUGO;
		if (tbuf.shm_flags & PRV_DEST)
			tbuf.shm_flags |= SHM_DEST;
		if (tbuf.shm_flags & PRV_LOCKED)
			tbuf.shm_flags |= SHM_LOCKED;
		tbuf.shm_flags &= SHM_DEST | SHM_LOCKED | S_IRWXUGO;
		tbuf.shm_segsz	= shp->shm_segsz;
		tbuf.shm_atime	= shp->shm_atim;
		tbuf.shm_dtime	= shp->shm_dtim;
		tbuf.shm_ctime	= shp->shm_ctim;
		tbuf.shm_cpid	= shp->shm_cprid;
		tbuf.shm_lpid	= shp->shm_lprid;
		tbuf.shm_nattch	= shp->shm_nattch;
		shm_unlock(shmid);
		if (copy_shmid_to_user (buf, &tbuf, version))
			return -EFAULT;
		return result;
	}
	case SHM_LOCK:
	case SHM_UNLOCK:
	{
		/* Allow superuser to lock segment in memory */
		/* Should the pages be faulted in here or leave it to user? */
		/* need to determine interaction with current->swappable */
		if ((shmid % SEQ_MULTIPLIER) == zero_id)
			return -EINVAL;
		if (!capable(CAP_IPC_LOCK))
			return -EPERM;

		shp = shm_lock(shmid);
		if (shp == NULL)
			return -EINVAL;
		err = shm_checkid(shp, shmid);
		if (err)
			goto out_unlock;
		if (cmd == SHM_LOCK)
			shp->shm_flags |= PRV_LOCKED;
		else
			shp->shm_flags &= ~PRV_LOCKED;
		shm_unlock(shmid);
		return err;
	}
	case IPC_RMID:
	{
		/*
		 * We cannot simply remove the file. The SVID states
		 * that the block remains until the last person
		 * detaches from it, then is deleted. A shmat() on
		 * an RMID segment is legal in older Linux and if
		 * we change it apps break...
		 *
		 * Instead we set a destroyed flag, and then blow
		 * the name away when the usage hits zero.
		 */
		if ((shmid % SEQ_MULTIPLIER) == zero_id)
			return -EINVAL;
		down(&shm_ids.sem);
		shp = shm_lock(shmid);
		if (shp == NULL) {
			up(&shm_ids.sem);
			return -EINVAL;
		}
		err = shm_checkid(shp, shmid);
		if (err == 0) {
			if (shp->shm_nattch == 0 &&
			    !(shp->shm_flags & SHM_UNLK)) {
				int id = shp->id;
				shm_unlock(shmid);
				up(&shm_ids.sem);
				/*
				 * We can't hold shm_lock here else we
				 * will deadlock in shm_lookup when we
				 * try to recursively grab it.
				 */
				return shm_remove_name(id);
			}
			shp->shm_flags |= PRV_DEST;
			/* Do not find it any more */
			shp->shm_perm.key = IPC_PRIVATE;
		}
		/* Unlock */
		shm_unlock(shmid);
		up(&shm_ids.sem);
		return err;
	}

	case IPC_SET:
	{
		if ((shmid % SEQ_MULTIPLIER) == zero_id)
			return -EINVAL;

		if (copy_shmid_from_user (&setbuf, buf, version))
			return -EFAULT;
		down(&shm_ids.sem);
		shp = shm_lock(shmid);
		err = -EINVAL;
		if (shp == NULL)
			goto out_up;
		err = shm_checkid(shp, shmid);
		if (err)
			goto out_unlock_up;
		err = -EPERM;
		if (current->euid != shp->shm_perm.uid &&
		    current->euid != shp->shm_perm.cuid &&
		    !capable(CAP_SYS_ADMIN)) {
			goto out_unlock_up;
		}

		shp->shm_perm.uid = setbuf.uid;
		shp->shm_perm.gid = setbuf.gid;
		shp->shm_flags = (shp->shm_flags & ~S_IRWXUGO)
			| (setbuf.mode & S_IRWXUGO);
		shp->shm_ctim = CURRENT_TIME;
		break;
	}

	default:
		return -EINVAL;
	}

	err = 0;
out_unlock_up:
	shm_unlock(shmid);
out_up:
	up(&shm_ids.sem);
	return err;
out_unlock:
	shm_unlock(shmid);
	return err;
}
static inline void shm_inc (int id) {
	struct shmid_kernel *shp;

	if (!(shp = shm_lock(id)))
		BUG();
	shp->shm_atim = CURRENT_TIME;
	shp->shm_lprid = current->pid;
	shp->shm_nattch++;
	shm_unlock(id);
}

static int shm_mmap(struct file * file, struct vm_area_struct * vma)
{
	if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED))
		return -EINVAL;	/* we cannot do private writable mappings */
	UPDATE_ATIME(file->f_dentry->d_inode);
	vma->vm_ops = &shm_vm_ops;
	shm_inc(file->f_dentry->d_inode->i_ino);
	return 0;
}
/*
 * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
 */
asmlinkage long sys_shmat (int shmid, char *shmaddr, int shmflg, ulong *raddr)
{
	unsigned long addr;
	struct file * file;
	int err;
	unsigned long flags;
	unsigned long prot;
	unsigned long o_flags;
	int acc_mode;
	struct dentry *dentry;
	char name[SHM_FMT_LEN+1];

	if (!shm_sb || (shmid % SEQ_MULTIPLIER) == zero_id)
		return -EINVAL;

	if ((addr = (ulong)shmaddr)) {
		if (addr & (SHMLBA-1)) {
			if (shmflg & SHM_RND)
				addr &= ~(SHMLBA-1);	/* round down */
			else
				return -EINVAL;
		}
		flags = MAP_SHARED | MAP_FIXED;
	} else
		flags = MAP_SHARED;

	if (shmflg & SHM_RDONLY) {
		prot = PROT_READ;
		o_flags = O_RDONLY;
		acc_mode = MAY_READ;
	} else {
		prot = PROT_READ | PROT_WRITE;
		o_flags = O_RDWR;
		acc_mode = MAY_READ | MAY_WRITE;
	}

	sprintf (name, SHM_FMT, shmid);

	lock_kernel();
	mntget(shm_fs_type.kern_mnt);
	dentry = lookup_one(name, lock_parent(shm_sb->s_root));
	unlock_dir(shm_sb->s_root);
	err = PTR_ERR(dentry);
	if (IS_ERR(dentry))
		goto bad_file;
	err = -ENOENT;
	if (!dentry->d_inode)
		goto bad_file;
	err = permission(dentry->d_inode, acc_mode);
	if (err)
		goto bad_file1;
	file = dentry_open(dentry, shm_fs_type.kern_mnt, o_flags);
	err = PTR_ERR(file);
	if (IS_ERR (file))
		goto bad_file1;
	down(&current->mm->mmap_sem);
	*raddr = do_mmap (file, addr, file->f_dentry->d_inode->i_size,
			  prot, flags, 0);
	up(&current->mm->mmap_sem);
	unlock_kernel();
	if (IS_ERR(*raddr))
		err = PTR_ERR(*raddr);
	else
		err = 0;
	fput (file);
	return err;

bad_file1:
	dput(dentry);
bad_file:
	mntput(shm_fs_type.kern_mnt);
	unlock_kernel();
	if (err == -ENOENT)
		return -EINVAL;
	return err;
}
/* This is called by fork, once for every shm attach. */
static void shm_open (struct vm_area_struct *shmd)
{
	shm_inc (shmd->vm_file->f_dentry->d_inode->i_ino);
}

/*
 * Remove a name.
 */
static int shm_remove_name(int id)
{
	struct dentry *dir;
	struct dentry *dentry;
	int error;
	char name[SHM_FMT_LEN+1];

	sprintf (name, SHM_FMT, id);
	lock_kernel();
	dir = lock_parent(shm_sb->s_root);
	dentry = lookup_one(name, dir);
	error = PTR_ERR(dentry);
	if (!IS_ERR(dentry)) {
		/*
		 * We have to do our own unlink to prevent the vfs
		 * permission check. The SYSV IPC layer has already
		 * checked the permissions, which do not comply with
		 * the vfs rules.
		 */
		struct inode *inode = dir->d_inode;
		down(&inode->i_zombie);
		error = shm_unlink(inode, dentry);
		up(&inode->i_zombie);
		dput(dentry);
	}
	unlock_dir(dir);
	unlock_kernel();
	return error;
}
/*
 * Remove the attach descriptor shmd.
 * Free memory for the segment if it is marked destroyed.
 * The descriptor has already been removed from the current->mm->mmap list
 * and will later be kfree()d.
 */
static void shm_close (struct vm_area_struct *shmd)
{
	int id = shmd->vm_file->f_dentry->d_inode->i_ino;
	struct shmid_kernel *shp;

	/* remove from the list of attaches of the shm segment */
	if (!(shp = shm_lock(id)))
		BUG();
	shp->shm_lprid = current->pid;
	shp->shm_dtim = CURRENT_TIME;
	shp->shm_nattch--;
	if (shp->shm_nattch == 0 &&
	    shp->shm_flags & PRV_DEST &&
	    !(shp->shm_flags & SHM_UNLK)) {
		int pid = shp->id;
		int err;
		shm_unlock(id);

		/* The kernel lock prevents new attaches from
		 * happening. We can't hold shm_lock here
		 * else we will deadlock in shm_lookup when we
		 * try to recursively grab it.
		 */
		err = shm_remove_name(pid);
		if (err && err != -EINVAL && err != -ENOENT)
			printk(KERN_ERR "Unlink of SHM id %d failed (%d).\n", pid, err);
	} else {
		shm_unlock(id);
	}
}
/*
 * Detach and kill the segment if marked destroyed.
 * The work is done in shm_close.
 */
asmlinkage long sys_shmdt (char *shmaddr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *shmd, *shmdnext;

	down(&mm->mmap_sem);
	for (shmd = mm->mmap; shmd; shmd = shmdnext) {
		shmdnext = shmd->vm_next;
		if (shmd->vm_ops == &shm_vm_ops
		    && shmd->vm_start - (shmd->vm_pgoff << PAGE_SHIFT) == (ulong) shmaddr)
			do_munmap(mm, shmd->vm_start, shmd->vm_end - shmd->vm_start);
	}
	up(&mm->mmap_sem);
	return 0;
}
/*
 * Enter the shm page into the SHM data structures.
 *
 * The way "nopage" is done, we don't actually have to
 * do anything here: nopage will have filled in the shm
 * data structures already, and shm_swap_out() will just
 * work off them...
 */
static int shm_swapout(struct page * page, struct file *file)
{
	return 0;
}
/*
 * page not present ... go through shm_dir
 */
static struct page * shm_nopage_core(struct shmid_kernel *shp, unsigned int idx, int *swp, int *rss, unsigned long address)
{
	pte_t pte;
	struct page * page;

	if (idx >= shp->shm_npages)
		return NOPAGE_SIGBUS;

	pte = SHM_ENTRY(shp, idx);
	if (!pte_present(pte)) {
		/* page not present, so shm_swap can't race with us,
		   and the semaphore protects us from other tasks that
		   could potentially fault on our pte under us */
		if (pte_none(pte)) {
			shm_unlock(shp->id);
			page = alloc_page(GFP_HIGHUSER);
			if (!page)
				goto oom;
			clear_user_highpage(page, address);
			if ((shp != shm_lock(shp->id)) && (shp->id != zero_id))
				BUG();
		} else {
			swp_entry_t entry = pte_to_swp_entry(pte);

			shm_unlock(shp->id);
			page = lookup_swap_cache(entry);
			if (!page) {
				lock_kernel();
				swapin_readahead(entry);
				page = read_swap_cache(entry);
				unlock_kernel();
				if (!page)
					goto oom;
			}
			delete_from_swap_cache(page);
			page = replace_with_highmem(page);
			swap_free(entry);
			if ((shp != shm_lock(shp->id)) && (shp->id != zero_id))
				BUG();
			(*swp)--;
		}
		(*rss)++;
		pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
		SHM_ENTRY(shp, idx) = pte;
	}

	/* pte_val(pte) == SHM_ENTRY (shp, idx) */
	get_page(pte_page(pte));
	return pte_page(pte);

oom:
	shm_lock(shp->id);
	return NOPAGE_OOM;
}
static struct page * shm_nopage(struct vm_area_struct * shmd, unsigned long address, int no_share)
{
	struct page * page;
	struct shmid_kernel *shp;
	unsigned int idx;
	struct inode * inode = shmd->vm_file->f_dentry->d_inode;

	idx = (address - shmd->vm_start) >> PAGE_SHIFT;
	idx += shmd->vm_pgoff;

	down(&inode->i_sem);
	if (!(shp = shm_lock(inode->i_ino)))
		BUG();
	page = shm_nopage_core(shp, idx, &shm_swp, &shm_rss, address);
	shm_unlock(inode->i_ino);
	up(&inode->i_sem);
	return(page);
}
#define OKAY	0
#define RETRY	1
#define FAILED	2
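/* Return codes of shm_swap_core(), as used by the callers below:
 * OKAY means the page was moved into the swap cache and the caller must
 * write it out and update the statistics; RETRY means this page cannot
 * be swapped right now, try the next candidate; FAILED means give up the
 * whole pass (the caller frees the preloaded swap entry). */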
static int shm_swap_core(struct shmid_kernel *shp, unsigned long idx, swp_entry_t swap_entry, int *counter, struct page **outpage)
{
	pte_t page;
	struct page *page_map;

	page = SHM_ENTRY(shp, idx);
	if (!pte_present(page))
		return RETRY;
	page_map = pte_page(page);
	if (page_map->zone->free_pages > page_map->zone->pages_high)
		return RETRY;
	if (shp->id != zero_id) swap_attempts++;

	if (--*counter < 0) /* failed */
		return FAILED;
	if (page_count(page_map) != 1)
		return RETRY;

	if (!(page_map = prepare_highmem_swapout(page_map)))
		return FAILED;
	SHM_ENTRY (shp, idx) = swp_entry_to_pte(swap_entry);

	/* add the locked page to the swap cache before allowing
	   the swapin path to run lookup_swap_cache(). This avoids
	   reading a not yet uptodate block from disk.
	   NOTE: we just accounted the swap space reference for this
	   swap cache page at __get_swap_page() time. */
	lock_page(page_map);
	add_to_swap_cache(*outpage = page_map, swap_entry);
	return OKAY;
}
static void shm_swap_postop(struct page *page)
{
	lock_kernel();
	rw_swap_page(WRITE, page, 0);
	unlock_kernel();
	__free_page(page);
}

static int shm_swap_preop(swp_entry_t *swap_entry)
{
	lock_kernel();
	/* subtle: preload the swap count for the swap cache. We can't
	   increase the count inside the critical section as we can't release
	   the shm_lock there. And we can't acquire the big lock with the
	   shm_lock held (otherwise we would deadlock too easily). */
	*swap_entry = __get_swap_page(2);
	if (!(*swap_entry).val) {
		unlock_kernel();
		return 1;
	}
	unlock_kernel();
	return 0;
}
/*
 * Goes through counter = (shm_rss >> prio) present shm pages.
 */
static unsigned long swap_id = 0; /* currently being swapped */
static unsigned long swap_idx = 0; /* next to swap */

int shm_swap (int prio, int gfp_mask)
{
	struct shmid_kernel *shp;
	swp_entry_t swap_entry;
	unsigned long id, idx;
	int loop = 0;
	int counter;
	struct page * page_map;

	zshm_swap(prio, gfp_mask);
	counter = shm_rss >> prio;
	if (!counter)
		return 0;
	if (shm_swap_preop(&swap_entry))
		return 0;

	shm_lockall();
check_id:
	shp = shm_get(swap_id);
	if (shp == NULL || shp->shm_flags & SHM_LOCKED) {
next_id:
		swap_idx = 0;
		if (++swap_id > shm_ids.max_id) {
			swap_id = 0;
			if (loop) {
failed:
				shm_unlockall();
				__swap_free(swap_entry, 2);
				return 0;
			}
			loop = 1;
		}
		goto check_id;
	}
	id = swap_id;

check_table:
	idx = swap_idx++;
	if (idx >= shp->shm_npages)
		goto next_id;

	switch (shm_swap_core(shp, idx, swap_entry, &counter, &page_map)) {
		case RETRY: goto check_table;
		case FAILED: goto failed;
	}
	swap_successes++;
	shm_swp++;
	shm_rss--;
	shm_unlockall();

	shm_swap_postop(page_map);
	return 1;
}
/*
 * Free the swap entry and set the new pte for the shm page.
 */
static void shm_unuse_page(struct shmid_kernel *shp, unsigned long idx,
			   swp_entry_t entry, struct page *page)
{
	pte_t pte;

	pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
	SHM_ENTRY(shp, idx) = pte;
	get_page(page);
	shm_rss++;

	shm_swp--;

	swap_free(entry);
}

static int shm_unuse_core(struct shmid_kernel *shp, swp_entry_t entry, struct page *page)
{
	int n;

	for (n = 0; n < shp->shm_npages; n++) {
		if (pte_none(SHM_ENTRY(shp, n)))
			continue;
		if (pte_present(SHM_ENTRY(shp, n)))
			continue;
		if (pte_to_swp_entry(SHM_ENTRY(shp, n)).val == entry.val) {
			shm_unuse_page(shp, n, entry, page);
			return 1;
		}
	}
	return 0;
}
/*
 * shm_unuse() searches for a shm page that may have been swapped out.
 */
void shm_unuse(swp_entry_t entry, struct page *page)
{
	int i;

	shm_lockall();
	for (i = 0; i <= shm_ids.max_id; i++) {
		struct shmid_kernel *shp = shm_get(i);
		if (shp == NULL)
			continue;
		if (shm_unuse_core(shp, entry, page))
			goto out;
	}
out:
	shm_unlockall();
	zmap_unuse(entry, page);
}
#ifdef CONFIG_PROC_FS
static int sysvipc_shm_read_proc(char *buffer, char **start, off_t offset, int length, int *eof, void *data)
{
	off_t pos = 0;
	off_t begin = 0;
	int i, len = 0;

	down(&shm_ids.sem);
	len += sprintf(buffer, "key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime name\n");

	for (i = 0; i <= shm_ids.max_id; i++) {
		struct shmid_kernel* shp;

		if (i == zero_id)
			continue;
		shp = shm_lock(i);
		if (shp != NULL) {
#define SMALL_STRING "%10d %10d %4o %10u %5u %5u %5d %5u %5u %5u %5u %10lu %10lu %10lu %.*s%s\n"
#define BIG_STRING "%10d %10d %4o %21u %5u %5u %5d %5u %5u %5u %5u %10lu %10lu %10lu %.*s%s\n"
			char *format;

			if (sizeof(size_t) <= sizeof(int))
				format = SMALL_STRING;
			else
				format = BIG_STRING;
			len += sprintf(buffer + len, format,
				shp->shm_perm.key,
				shm_buildid(i, shp->shm_perm.seq),
				shp->shm_flags,
				shp->shm_segsz,
				shp->shm_cprid,
				shp->shm_lprid,
				shp->shm_nattch,
				shp->shm_perm.uid,
				shp->shm_perm.gid,
				shp->shm_perm.cuid,
				shp->shm_perm.cgid,
				shp->shm_atim,
				shp->shm_dtim,
				shp->shm_ctim,
				shp->shm_namelen,
				shp->shm_name,
				shp->shm_flags & SHM_UNLK ? " (deleted)" : "");
			shm_unlock(i);
		}
		pos += len;
		if (pos < offset) {
			len = 0;
			begin = pos;
		}
		if (pos > offset + length)
			goto done;
	}
	*eof = 1;
done:
	up(&shm_ids.sem);
	*start = buffer + (offset - begin);
	len -= (offset - begin);
	if (len > length)
		len = length;
	if (len < 0)
		len = 0;
	return len;
}
#endif
#define VMA_TO_SHP(vma)	((vma)->vm_file->private_data)

static spinlock_t zmap_list_lock = SPIN_LOCK_UNLOCKED;
static unsigned long zswap_idx = 0; /* next to swap */
static struct shmid_kernel *zswap_shp = &zshmid_kernel;
static int zshm_rss;

static struct vm_operations_struct shmzero_vm_ops = {
	open:		shmzero_open,
	close:		shmzero_close,
	nopage:		shmzero_nopage,
	swapout:	shm_swapout,
};
/*
 * In this implementation, the "unuse" and "swapout" interfaces are
 * interlocked via the kernel_lock, as well as shm_lock(zero_id).
 * "unuse" and "nopage/swapin", as well as "swapout" and "nopage/swapin",
 * interlock via shm_lock(zero_id). All these interlocks could be based
 * on a per-mapping lock instead of being a global lock.
 */
/*
 * Reference (existence) counting on the file/dentry/inode is done
 * by generic vm_file code. The zero code does not hold any reference
 * on the pseudo-file. This is possible because the open/close calls
 * are bracketed by the file count update calls.
 */
static struct file *file_setup(struct file *fzero, struct shmid_kernel *shp)
{
	struct file *filp;
	struct inode *inp;

	if ((filp = get_empty_filp()) == 0)
		return(filp);
	if ((inp = get_empty_inode()) == 0) {
		put_filp(filp);
		return(0);
	}
	if ((filp->f_dentry = d_alloc(zdent, &(const struct qstr) { "dev/zero",
							8, 0 })) == 0) {
		iput(inp);
		put_filp(filp);
		return(0);
	}
	filp->f_vfsmnt = mntget(shm_fs_type.kern_mnt);
	d_instantiate(filp->f_dentry, inp);

	/*
	 * Copy over dev/ino for the benefit of procfs. Use
	 * ino to indicate separate mappings.
	 */
	filp->f_dentry->d_inode->i_dev = shm_fs_type.kern_mnt->mnt_sb->s_dev;
	filp->f_dentry->d_inode->i_ino = (unsigned long)shp;
	if (fzero)
		fput(fzero);	/* release /dev/zero file */
	return(filp);
}
int map_zero_setup(struct vm_area_struct *vma)
{
	extern int vm_enough_memory(long pages);
	struct shmid_kernel *shp;
	struct file *filp;

	if (!vm_enough_memory((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))
		return -ENOMEM;
	if (IS_ERR(shp = seg_alloc((vma->vm_end - vma->vm_start) / PAGE_SIZE, 0)))
		return PTR_ERR(shp);
	if ((filp = file_setup(vma->vm_file, shp)) == 0) {
		seg_free(shp, 0);
		return -ENOMEM;
	}
	vma->vm_file = filp;
	VMA_TO_SHP(vma) = (void *)shp;
	shp->id = zero_id;
	init_MUTEX(&shp->zsem);
	vma->vm_ops = &shmzero_vm_ops;
	shmzero_open(vma);
	spin_lock(&zmap_list_lock);
	list_add(&shp->zero_list, &zshmid_kernel.zero_list);
	spin_unlock(&zmap_list_lock);
	return 0;
}
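/* map_zero_setup() is the hook used for MAP_SHARED mappings of /dev/zero
 * (the caller lives in the memory-device driver, outside this file):
 * shared anonymous memory gets a private, unaccounted shm segment of its
 * own, managed through the zero_list below. */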
static void shmzero_open(struct vm_area_struct *shmd)
{
	struct shmid_kernel *shp;

	shp = VMA_TO_SHP(shmd);
	down(&shp->zsem);
	shp->shm_nattch++;
	up(&shp->zsem);
}

static void shmzero_close(struct vm_area_struct *shmd)
{
	int done = 0;
	struct shmid_kernel *shp;

	shp = VMA_TO_SHP(shmd);
	down(&shp->zsem);
	if (--shp->shm_nattch == 0)
		done = 1;
	up(&shp->zsem);
	if (done) {
		spin_lock(&zmap_list_lock);
		if (shp == zswap_shp)
			zswap_shp = list_entry(zswap_shp->zero_list.next,
						struct shmid_kernel, zero_list);
		list_del(&shp->zero_list);
		spin_unlock(&zmap_list_lock);
		seg_free(shp, 0);
	}
}
static struct page * shmzero_nopage(struct vm_area_struct * shmd, unsigned long address, int no_share)
{
	struct page *page;
	struct shmid_kernel *shp;
	unsigned int idx;
	int dummy;

	idx = (address - shmd->vm_start) >> PAGE_SHIFT;
	idx += shmd->vm_pgoff;

	shp = VMA_TO_SHP(shmd);
	down(&shp->zsem);
	shm_lock(zero_id);
	page = shm_nopage_core(shp, idx, &dummy, &zshm_rss, address);
	shm_unlock(zero_id);
	up(&shp->zsem);
	return(page);
}

static void zmap_unuse(swp_entry_t entry, struct page *page)
{
	struct shmid_kernel *shp;

	spin_lock(&zmap_list_lock);
	shm_lock(zero_id);
	for (shp = list_entry(zshmid_kernel.zero_list.next, struct shmid_kernel,
			zero_list); shp != &zshmid_kernel;
			shp = list_entry(shp->zero_list.next, struct shmid_kernel,
					zero_list)) {
		if (shm_unuse_core(shp, entry, page))
			break;
	}
	shm_unlock(zero_id);
	spin_unlock(&zmap_list_lock);
}
static void zshm_swap (int prio, int gfp_mask)
{
	struct shmid_kernel *shp;
	swp_entry_t swap_entry;
	unsigned long idx;
	int loop = 0;
	int counter;
	struct page * page_map;

	counter = zshm_rss >> prio;
	if (!counter)
		return;
next:
	if (shm_swap_preop(&swap_entry))
		return;

	spin_lock(&zmap_list_lock);
	shm_lock(zero_id);
	if (zshmid_kernel.zero_list.next == 0)
		goto failed;
next_id:
	if (zswap_shp == &zshmid_kernel) {
		if (loop) {
failed:
			shm_unlock(zero_id);
			spin_unlock(&zmap_list_lock);
			__swap_free(swap_entry, 2);
			return;
		}
		zswap_shp = list_entry(zshmid_kernel.zero_list.next,
					struct shmid_kernel, zero_list);
		zswap_idx = 0;
		loop = 1;
	}
	shp = zswap_shp;

check_table:
	idx = zswap_idx++;
	if (idx >= shp->shm_npages) {
		zswap_shp = list_entry(zswap_shp->zero_list.next,
					struct shmid_kernel, zero_list);
		zswap_idx = 0;
		goto next_id;
	}

	switch (shm_swap_core(shp, idx, swap_entry, &counter, &page_map)) {
		case RETRY: goto check_table;
		case FAILED: goto failed;
	}
	shm_unlock(zero_id);
	spin_unlock(&zmap_list_lock);

	shm_swap_postop(page_map);
	if (counter)
		goto next;
	return;
}