fs/proc/base.c

   1 /*
   2  *  linux/fs/proc/base.c
   3  *
   4  *  Copyright (C) 1991, 1992 Linus Torvalds
   5  *
   6  *  proc base directory handling functions
   7  *
   8  *  1999, Al Viro. Rewritten. Now it covers the whole per-process part.
   9  *  Instead of using magical inumbers to determine the kind of object
  10  *  we allocate and fill in-core inodes upon lookup. They don't even
  11  *  go into icache. We cache the reference to task_struct upon lookup too.
  12  *  Eventually it should become a filesystem in its own. We don't use the
  13  *  rest of procfs anymore.
  14  *
  15  *
  16  *  Changelog:
  17  *  17-Jan-2005
  18  *  Allan Bezerra
  19  *  Bruna Moreira <bruna.moreira@indt.org.br>
  20  *  Edjard Mota <edjard.mota@indt.org.br>
  21  *  Ilias Biris <ilias.biris@indt.org.br>
  22  *  Mauricio Lin <mauricio.lin@indt.org.br>
  23  *
  24  *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
  25  *
  26  *  A new process specific entry (smaps) included in /proc. It shows the
  27  *  size of rss for each memory area. The maps entry lacks information
  28  *  about physical memory size (rss) for each mapped file, i.e.,
  29  *  rss information for executables and library files.
  30  *  This additional information is useful for any tools that need to know
  31  *  about physical memory consumption for a process specific library.
  32  *
  33  *  Changelog:
  34  *  21-Feb-2005
  35  *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
  36  *  Pud inclusion in the page table walking.
  37  *
  38  *  ChangeLog:
  39  *  10-Mar-2005
  40  *  10LE Instituto Nokia de Tecnologia - INdT:
  41  *  A better way to walk through the page table as suggested by Hugh Dickins.
  42  *
  43  *  Simo Piiroinen <simo.piiroinen@nokia.com>:
  44  *  Smaps information related to shared, private, clean and dirty pages.
  45  *
  46  *  Paul Mundt <paul.mundt@nokia.com>:
  47  *  Overall revision about smaps.
  48  */
  49
  50 #include <asm/uaccess.h>
  51
  52 #include <linux/errno.h>
  53 #include <linux/time.h>
  54 #include <linux/proc_fs.h>
  55 #include <linux/stat.h>
  56 #include <linux/init.h>
  57 #include <linux/capability.h>
  58 #include <linux/file.h>
  59 #include <linux/fdtable.h>
  60 #include <linux/string.h>
  61 #include <linux/seq_file.h>
  62 #include <linux/namei.h>
  63 #include <linux/mnt_namespace.h>
  64 #include <linux/mm.h>
  65 #include <linux/rcupdate.h>
  66 #include <linux/kallsyms.h>
  67 #include <linux/resource.h>
  68 #include <linux/module.h>
  69 #include <linux/mount.h>
  70 #include <linux/security.h>
  71 #include <linux/ptrace.h>
  72 #include <linux/cgroup.h>
  73 #include <linux/cpuset.h>
  74 #include <linux/audit.h>
  75 #include <linux/poll.h>
  76 #include <linux/nsproxy.h>
  77 #include <linux/oom.h>
  78 #include <linux/elf.h>
  79 #include <linux/pid_namespace.h>
  80 #include "internal.h"
  81
  82 /* NOTE:
  83  *      Implementing inode permission operations in /proc is almost
  84  *      certainly an error.  Permission checks need to happen during
  85  *      each system call not at open time.  The reason is that most of
  86  *      what we wish to check for permissions in /proc varies at runtime.
  87  *
  88  *      The classic example of a problem is opening file descriptors
  89  *      in /proc for a task before it execs a suid executable.
  90  */
  91
  92 struct pid_entry {
  93         char *name;
  94         int len;
  95         mode_t mode;
  96         const struct inode_operations *iop;
  97         const struct file_operations *fop;
  98         union proc_op op;
  99 };
 100
 101 #define NOD(NAME, MODE, IOP, FOP, OP) {                 \
 102         .name = (NAME),                                 \
 103         .len  = sizeof(NAME) - 1,                       \
 104         .mode = MODE,                                   \
 105         .iop  = IOP,                                    \
 106         .fop  = FOP,                                    \
 107         .op   = OP,                                     \
 108 }
 109
 110 #define DIR(NAME, MODE, OTYPE)                                                  \
 111         NOD(NAME, (S_IFDIR|(MODE)),                                             \
 112                 &proc_##OTYPE##_inode_operations, &proc_##OTYPE##_operations,   \
 113                 {} )
 114 #define LNK(NAME, OTYPE)                                        \
 115         NOD(NAME, (S_IFLNK|S_IRWXUGO),                          \
 116                 &proc_pid_link_inode_operations, NULL,          \
 117                 { .proc_get_link = &proc_##OTYPE##_link } )
 118 #define REG(NAME, MODE, OTYPE)                          \
 119         NOD(NAME, (S_IFREG|(MODE)), NULL,               \
 120                 &proc_##OTYPE##_operations, {})
 121 #define INF(NAME, MODE, OTYPE)                          \
 122         NOD(NAME, (S_IFREG|(MODE)),                     \
 123                 NULL, &proc_info_file_operations,       \
 124                 { .proc_read = &proc_##OTYPE } )
 125 #define ONE(NAME, MODE, OTYPE)                          \
 126         NOD(NAME, (S_IFREG|(MODE)),                     \
 127                 NULL, &proc_single_file_operations,     \
 128                 { .proc_show = &proc_##OTYPE } )
 129
 130 int maps_protect;
 131 EXPORT_SYMBOL(maps_protect);
 132
 133 static struct fs_struct *get_fs_struct(struct task_struct *task)
 134 {
 135         struct fs_struct *fs;
 136         task_lock(task);
 137         fs = task->fs;
 138         if(fs)
 139                 atomic_inc(&fs->count);
 140         task_unlock(task);
 141         return fs;
 142 }
 143
 144 static int get_nr_threads(struct task_struct *tsk)
 145 {
 146         /* Must be called with the rcu_read_lock held */
 147         unsigned long flags;
 148         int count = 0;
 149
 150         if (lock_task_sighand(tsk, &flags)) {
 151                 count = atomic_read(&tsk->signal->count);
 152                 unlock_task_sighand(tsk, &flags);
 153         }
 154         return count;
 155 }
 156
 157 static int proc_cwd_link(struct inode *inode, struct path *path)
 158 {
 159         struct task_struct *task = get_proc_task(inode);
 160         struct fs_struct *fs = NULL;
 161         int result = -ENOENT;
 162
 163         if (task) {
 164                 fs = get_fs_struct(task);
 165                 put_task_struct(task);
 166         }
 167         if (fs) {
 168                 read_lock(&fs->lock);
 169                 *path = fs->pwd;
 170                 path_get(&fs->pwd);
 171                 read_unlock(&fs->lock);
 172                 result = 0;
 173                 put_fs_struct(fs);
 174         }
 175         return result;
 176 }
 177
 178 static int proc_root_link(struct inode *inode, struct path *path)
 179 {
 180         struct task_struct *task = get_proc_task(inode);
 181         struct fs_struct *fs = NULL;
 182         int result = -ENOENT;
 183
 184         if (task) {
 185                 fs = get_fs_struct(task);
 186                 put_task_struct(task);
 187         }
 188         if (fs) {
 189                 read_lock(&fs->lock);
 190                 *path = fs->root;
 191                 path_get(&fs->root);
 192                 read_unlock(&fs->lock);
 193                 result = 0;
 194                 put_fs_struct(fs);
 195         }
 196         return result;
 197 }
 198
 199 /*
 200  * Return zero if current may access user memory in @task, -error if not.
 201  */
 202 static int check_mem_permission(struct task_struct *task)
 203 {
 204         /*
 205          * A task can always look at itself, in case it chooses
 206          * to use system calls instead of load instructions.
 207          */
 208         if (task == current)
 209                 return 0;
 210
 211         /*
 212          * If current is actively ptrace'ing, and would also be
 213          * permitted to freshly attach with ptrace now, permit it.
 214          */
 215         if (task->parent == current && (task->ptrace & PT_PTRACED) &&
 216             task_is_stopped_or_traced(task) &&
 217             ptrace_may_attach(task))
 218                 return 0;
 219
 220         /*
 221          * Noone else is allowed.
 222          */
 223         return -EPERM;
 224 }
 225
 226 struct mm_struct *mm_for_maps(struct task_struct *task)
 227 {
 228         struct mm_struct *mm = get_task_mm(task);
 229         if (!mm)
 230                 return NULL;
 231         down_read(&mm->mmap_sem);
 232         task_lock(task);
 233         if (task->mm != mm)
 234                 goto out;
 235         if (task->mm != current->mm && __ptrace_may_attach(task) < 0)
 236                 goto out;
 237         task_unlock(task);
 238         return mm;
 239 out:
 240         task_unlock(task);
 241         up_read(&mm->mmap_sem);
 242         mmput(mm);
 243         return NULL;
 244 }
 245
 246 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
 247 {
 248         int res = 0;
 249         unsigned int len;
 250         struct mm_struct *mm = get_task_mm(task);
 251         if (!mm)
 252                 goto out;
 253         if (!mm->arg_end)
 254                 goto out_mm;    /* Shh! No looking before we're done */
 255
 256         len = mm->arg_end - mm->arg_start;
 257
 258         if (len > PAGE_SIZE)
 259                 len = PAGE_SIZE;
 260
 261         res = access_process_vm(task, mm->arg_start, buffer, len, 0);
 262
 263         // If the nul at the end of args has been overwritten, then
 264         // assume application is using setproctitle(3).
 265         if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) {
 266                 len = strnlen(buffer, res);
 267                 if (len < res) {
 268                     res = len;
 269                 } else {
 270                         len = mm->env_end - mm->env_start;
 271                         if (len > PAGE_SIZE - res)
 272                                 len = PAGE_SIZE - res;
 273                         res += access_process_vm(task, mm->env_start, buffer+res, len, 0);
 274                         res = strnlen(buffer, res);
 275                 }
 276         }
 277 out_mm:
 278         mmput(mm);
 279 out:
 280         return res;
 281 }
 282
 283 static int proc_pid_auxv(struct task_struct *task, char *buffer)
 284 {
 285         int res = 0;
 286         struct mm_struct *mm = get_task_mm(task);
 287         if (mm) {
 288                 unsigned int nwords = 0;
 289                 do
 290                         nwords += 2;
 291                 while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
 292                 res = nwords * sizeof(mm->saved_auxv[0]);
 293                 if (res > PAGE_SIZE)
 294                         res = PAGE_SIZE;
 295                 memcpy(buffer, mm->saved_auxv, res);
 296                 mmput(mm);
 297         }
 298         return res;
 299 }
 300
 301
 302 #ifdef CONFIG_KALLSYMS
 303 /*
 304  * Provides a wchan file via kallsyms in a proper one-value-per-file format.
 305  * Returns the resolved symbol.  If that fails, simply return the address.
 306  */
 307 static int proc_pid_wchan(struct task_struct *task, char *buffer)
 308 {
 309         unsigned long wchan;
 310         char symname[KSYM_NAME_LEN];
 311
 312         wchan = get_wchan(task);
 313
 314         if (lookup_symbol_name(wchan, symname) < 0)
 315                 return sprintf(buffer, "%lu", wchan);
 316         else
 317                 return sprintf(buffer, "%s", symname);
 318 }
 319 #endif /* CONFIG_KALLSYMS */
 320
 321 #ifdef CONFIG_SCHEDSTATS
 322 /*
 323  * Provides /proc/PID/schedstat
 324  */
 325 static int proc_pid_schedstat(struct task_struct *task, char *buffer)
 326 {
 327         return sprintf(buffer, "%llu %llu %lu\n",
 328                         task->sched_info.cpu_time,
 329                         task->sched_info.run_delay,
 330                         task->sched_info.pcount);
 331 }
 332 #endif
 333
 334 #ifdef CONFIG_LATENCYTOP
 335 static int lstats_show_proc(struct seq_file *m, void *v)
 336 {
 337         int i;
 338         struct inode *inode = m->private;
 339         struct task_struct *task = get_proc_task(inode);
 340
 341         if (!task)
 342                 return -ESRCH;
 343         seq_puts(m, "Latency Top version : v0.1\n");
 344         for (i = 0; i < 32; i++) {
 345                 if (task->latency_record[i].backtrace[0]) {
 346                         int q;
 347                         seq_printf(m, "%i %li %li ",
 348                                 task->latency_record[i].count,
 349                                 task->latency_record[i].time,
 350                                 task->latency_record[i].max);
 351                         for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
 352                                 char sym[KSYM_NAME_LEN];
 353                                 char *c;
 354                                 if (!task->latency_record[i].backtrace[q])
 355                                         break;
 356                                 if (task->latency_record[i].backtrace[q] == ULONG_MAX)
 357                                         break;
 358                                 sprint_symbol(sym, task->latency_record[i].backtrace[q]);
 359                                 c = strchr(sym, '+');
 360                                 if (c)
 361                                         *c = 0;
 362                                 seq_printf(m, "%s ", sym);
 363                         }
 364                         seq_printf(m, "\n");
 365                 }
 366
 367         }
 368         put_task_struct(task);
 369         return 0;
 370 }
 371
 372 static int lstats_open(struct inode *inode, struct file *file)
 373 {
 374         return single_open(file, lstats_show_proc, inode);
 375 }
 376
 377 static ssize_t lstats_write(struct file *file, const char __user *buf,
 378                             size_t count, loff_t *offs)
 379 {
 380         struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
 381
 382         if (!task)
 383                 return -ESRCH;
 384         clear_all_latency_tracing(task);
 385         put_task_struct(task);
 386
 387         return count;
 388 }
 389
 390 static const struct file_operations proc_lstats_operations = {
 391         .open           = lstats_open,
 392         .read           = seq_read,
 393         .write          = lstats_write,
 394         .llseek         = seq_lseek,
 395         .release        = single_release,
 396 };
 397
 398 #endif
 399
 400 /* The badness from the OOM killer */
 401 unsigned long badness(struct task_struct *p, unsigned long uptime);
 402 static int proc_oom_score(struct task_struct *task, char *buffer)
 403 {
 404         unsigned long points;
 405         struct timespec uptime;
 406
 407         do_posix_clock_monotonic_gettime(&uptime);
 408         read_lock(&tasklist_lock);
 409         points = badness(task, uptime.tv_sec);
 410         read_unlock(&tasklist_lock);
 411         return sprintf(buffer, "%lu\n", points);
 412 }
 413
 414 struct limit_names {
 415         char *name;
 416         char *unit;
 417 };
 418
 419 static const struct limit_names lnames[RLIM_NLIMITS] = {
 420         [RLIMIT_CPU] = {"Max cpu time", "ms"},
 421         [RLIMIT_FSIZE] = {"Max file size", "bytes"},
 422         [RLIMIT_DATA] = {"Max data size", "bytes"},
 423         [RLIMIT_STACK] = {"Max stack size", "bytes"},
 424         [RLIMIT_CORE] = {"Max core file size", "bytes"},
 425         [RLIMIT_RSS] = {"Max resident set", "bytes"},
 426         [RLIMIT_NPROC] = {"Max processes", "processes"},
 427         [RLIMIT_NOFILE] = {"Max open files", "files"},
 428         [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"},
 429         [RLIMIT_AS] = {"Max address space", "bytes"},
 430         [RLIMIT_LOCKS] = {"Max file locks", "locks"},
 431         [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"},
 432         [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"},
 433         [RLIMIT_NICE] = {"Max nice priority", NULL},
 434         [RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
 435         [RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
 436 };
 437
 438 /* Display limits for a process */
 439 static int proc_pid_limits(struct task_struct *task, char *buffer)
 440 {
 441         unsigned int i;
 442         int count = 0;
 443         unsigned long flags;
 444         char *bufptr = buffer;
 445
 446         struct rlimit rlim[RLIM_NLIMITS];
 447
 448         rcu_read_lock();
 449         if (!lock_task_sighand(task,&flags)) {
 450                 rcu_read_unlock();
 451                 return 0;
 452         }
 453         memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
 454         unlock_task_sighand(task, &flags);
 455         rcu_read_unlock();
 456
 457         /*
 458          * print the file header
 459          */
 460         count += sprintf(&bufptr[count], "%-25s %-20s %-20s %-10s\n",
 461                         "Limit", "Soft Limit", "Hard Limit", "Units");
 462
 463         for (i = 0; i < RLIM_NLIMITS; i++) {
 464                 if (rlim[i].rlim_cur == RLIM_INFINITY)
 465                         count += sprintf(&bufptr[count], "%-25s %-20s ",
 466                                          lnames[i].name, "unlimited");
 467                 else
 468                         count += sprintf(&bufptr[count], "%-25s %-20lu ",
 469                                          lnames[i].name, rlim[i].rlim_cur);
 470
 471                 if (rlim[i].rlim_max == RLIM_INFINITY)
 472                         count += sprintf(&bufptr[count], "%-20s ", "unlimited");
 473                 else
 474                         count += sprintf(&bufptr[count], "%-20lu ",
 475                                          rlim[i].rlim_max);
 476
 477                 if (lnames[i].unit)
 478                         count += sprintf(&bufptr[count], "%-10s\n",
 479                                          lnames[i].unit);
 480                 else
 481                         count += sprintf(&bufptr[count], "\n");
 482         }
 483
 484         return count;
 485 }
 486
 487 /************************************************************************/
 488 /*                       Here the fs part begins                        */
 489 /************************************************************************/
 490
 491 /* permission checks */
 492 static int proc_fd_access_allowed(struct inode *inode)
 493 {
 494         struct task_struct *task;
 495         int allowed = 0;
 496         /* Allow access to a task's file descriptors if it is us or we
 497          * may use ptrace attach to the process and find out that
 498          * information.
 499          */
 500         task = get_proc_task(inode);
 501         if (task) {
 502                 allowed = ptrace_may_attach(task);
 503                 put_task_struct(task);
 504         }
 505         return allowed;
 506 }
 507
 508 static int proc_setattr(struct dentry *dentry, struct iattr *attr)
 509 {
 510         int error;
 511         struct inode *inode = dentry->d_inode;
 512
 513         if (attr->ia_valid & ATTR_MODE)
 514                 return -EPERM;
 515
 516         error = inode_change_ok(inode, attr);
 517         if (!error)
 518                 error = inode_setattr(inode, attr);
 519         return error;
 520 }
 521
 522 static const struct inode_operations proc_def_inode_operations = {
 523         .setattr        = proc_setattr,
 524 };
 525
 526 static int mounts_open_common(struct inode *inode, struct file *file,
 527                               const struct seq_operations *op)
 528 {
 529         struct task_struct *task = get_proc_task(inode);
 530         struct nsproxy *nsp;
 531         struct mnt_namespace *ns = NULL;
 532         struct fs_struct *fs = NULL;
 533         struct path root;
 534         struct proc_mounts *p;
 535         int ret = -EINVAL;
 536
 537         if (task) {
 538                 rcu_read_lock();
 539                 nsp = task_nsproxy(task);
 540                 if (nsp) {
 541                         ns = nsp->mnt_ns;
 542                         if (ns)
 543                                 get_mnt_ns(ns);
 544                 }
 545                 rcu_read_unlock();
 546                 if (ns)
 547                         fs = get_fs_struct(task);
 548                 put_task_struct(task);
 549         }
 550
 551         if (!ns)
 552                 goto err;
 553         if (!fs)
 554                 goto err_put_ns;
 555
 556         read_lock(&fs->lock);
 557         root = fs->root;
 558         path_get(&root);
 559         read_unlock(&fs->lock);
 560         put_fs_struct(fs);
 561
 562         ret = -ENOMEM;
 563         p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
 564         if (!p)
 565                 goto err_put_path;
 566
 567         file->private_data = &p->m;
 568         ret = seq_open(file, op);
 569         if (ret)
 570                 goto err_free;
 571
 572         p->m.private = p;
 573         p->ns = ns;
 574         p->root = root;
 575         p->event = ns->event;
 576
 577         return 0;
 578
 579  err_free:
 580         kfree(p);
 581  err_put_path:
 582         path_put(&root);
 583  err_put_ns:
 584         put_mnt_ns(ns);
 585  err:
 586         return ret;
 587 }
 588
 589 static int mounts_release(struct inode *inode, struct file *file)
 590 {
 591         struct proc_mounts *p = file->private_data;
 592         path_put(&p->root);
 593         put_mnt_ns(p->ns);
 594         return seq_release(inode, file);
 595 }
 596
 597 static unsigned mounts_poll(struct file *file, poll_table *wait)
 598 {
 599         struct proc_mounts *p = file->private_data;
 600         struct mnt_namespace *ns = p->ns;
 601         unsigned res = 0;
 602
 603         poll_wait(file, &ns->poll, wait);
 604
 605         spin_lock(&vfsmount_lock);
 606         if (p->event != ns->event) {
 607                 p->event = ns->event;
 608                 res = POLLERR;
 609         }
 610         spin_unlock(&vfsmount_lock);
 611
 612         return res;
 613 }
 614
 615 static int mounts_open(struct inode *inode, struct file *file)
 616 {
 617         return mounts_open_common(inode, file, &mounts_op);
 618 }
 619
 620 static const struct file_operations proc_mounts_operations = {
 621         .open           = mounts_open,
 622         .read           = seq_read,
 623         .llseek         = seq_lseek,
 624         .release        = mounts_release,
 625         .poll           = mounts_poll,
 626 };
 627
 628 static int mountinfo_open(struct inode *inode, struct file *file)
 629 {
 630         return mounts_open_common(inode, file, &mountinfo_op);
 631 }
 632
 633 static const struct file_operations proc_mountinfo_operations = {
 634         .open           = mountinfo_open,
 635         .read           = seq_read,
 636         .llseek         = seq_lseek,
 637         .release        = mounts_release,
 638         .poll           = mounts_poll,
 639 };
 640
 641 static int mountstats_open(struct inode *inode, struct file *file)
 642 {
 643         return mounts_open_common(inode, file, &mountstats_op);
 644 }
 645
 646 static const struct file_operations proc_mountstats_operations = {
 647         .open           = mountstats_open,
 648         .read           = seq_read,
 649         .llseek         = seq_lseek,
 650         .release        = mounts_release,
 651 };
 652
 653 #define PROC_BLOCK_SIZE (3*1024)                /* 4K page size but our output routines use some slack for overruns */
 654
 655 static ssize_t proc_info_read(struct file * file, char __user * buf,
 656                           size_t count, loff_t *ppos)
 657 {
 658         struct inode * inode = file->f_path.dentry->d_inode;
 659         unsigned long page;
 660         ssize_t length;
 661         struct task_struct *task = get_proc_task(inode);
 662
 663         length = -ESRCH;
 664         if (!task)
 665                 goto out_no_task;
 666
 667         if (count > PROC_BLOCK_SIZE)
 668                 count = PROC_BLOCK_SIZE;
 669
 670         length = -ENOMEM;
 671         if (!(page = __get_free_page(GFP_TEMPORARY)))
 672                 goto out;
 673
 674         length = PROC_I(inode)->op.proc_read(task, (char*)page);
 675
 676         if (length >= 0)
 677                 length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
 678         free_page(page);
 679 out:
 680         put_task_struct(task);
 681 out_no_task:
 682         return length;
 683 }
 684
 685 static const struct file_operations proc_info_file_operations = {
 686         .read           = proc_info_read,
 687 };
 688
 689 static int proc_single_show(struct seq_file *m, void *v)
 690 {
 691         struct inode *inode = m->private;
 692         struct pid_namespace *ns;
 693         struct pid *pid;
 694         struct task_struct *task;
 695         int ret;
 696
 697         ns = inode->i_sb->s_fs_info;
 698         pid = proc_pid(inode);
 699         task = get_pid_task(pid, PIDTYPE_PID);
 700         if (!task)
 701                 return -ESRCH;
 702
 703         ret = PROC_I(inode)->op.proc_show(m, ns, pid, task);
 704
 705         put_task_struct(task);
 706         return ret;
 707 }
 708
 709 static int proc_single_open(struct inode *inode, struct file *filp)
 710 {
 711         int ret;
 712         ret = single_open(filp, proc_single_show, NULL);
 713         if (!ret) {
 714                 struct seq_file *m = filp->private_data;
 715
 716                 m->private = inode;
 717         }
 718         return ret;
 719 }
 720
 721 static const struct file_operations proc_single_file_operations = {
 722         .open           = proc_single_open,
 723         .read           = seq_read,
 724         .llseek         = seq_lseek,
 725         .release        = single_release,
 726 };
 727
 728 static int mem_open(struct inode* inode, struct file* file)
 729 {
 730         file->private_data = (void*)((long)current->self_exec_id);
 731         return 0;
 732 }
 733
 734 static ssize_t mem_read(struct file * file, char __user * buf,
 735                         size_t count, loff_t *ppos)
 736 {
 737         struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
 738         char *page;
 739         unsigned long src = *ppos;
 740         int ret = -ESRCH;
 741         struct mm_struct *mm;
 742
 743         if (!task)
 744                 goto out_no_task;
 745
 746         if (check_mem_permission(task))
 747                 goto out;
 748
 749         ret = -ENOMEM;
 750         page = (char *)__get_free_page(GFP_TEMPORARY);
 751         if (!page)
 752                 goto out;
 753
 754         ret = 0;
 755
 756         mm = get_task_mm(task);
 757         if (!mm)
 758                 goto out_free;
 759
 760         ret = -EIO;
 761
 762         if (file->private_data != (void*)((long)current->self_exec_id))
 763                 goto out_put;
 764
 765         ret = 0;
 766
 767         while (count > 0) {
 768                 int this_len, retval;
 769
 770                 this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
 771                 retval = access_process_vm(task, src, page, this_len, 0);
 772                 if (!retval || check_mem_permission(task)) {
 773                         if (!ret)
 774                                 ret = -EIO;
 775                         break;
 776                 }
 777
 778                 if (copy_to_user(buf, page, retval)) {
 779                         ret = -EFAULT;
 780                         break;
 781                 }
 782
 783                 ret += retval;
 784                 src += retval;
 785                 buf += retval;
 786                 count -= retval;
 787         }
 788         *ppos = src;
 789
 790 out_put:
 791         mmput(mm);
 792 out_free:
 793         free_page((unsigned long) page);
 794 out:
 795         put_task_struct(task);
 796 out_no_task:
 797         return ret;
 798 }
 799
 800 #define mem_write NULL
 801
 802 #ifndef mem_write
 803 /* This is a security hazard */
 804 static ssize_t mem_write(struct file * file, const char __user *buf,
 805                          size_t count, loff_t *ppos)
 806 {
 807         int copied;
 808         char *page;
 809         struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
 810         unsigned long dst = *ppos;
 811
 812         copied = -ESRCH;
 813         if (!task)
 814                 goto out_no_task;
 815
 816         if (check_mem_permission(task))
 817                 goto out;
 818
 819         copied = -ENOMEM;
 820         page = (char *)__get_free_page(GFP_TEMPORARY);
 821         if (!page)
 822                 goto out;
 823
 824         copied = 0;
 825         while (count > 0) {
 826                 int this_len, retval;
 827
 828                 this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
 829                 if (copy_from_user(page, buf, this_len)) {
 830                         copied = -EFAULT;
 831                         break;
 832                 }
 833                 retval = access_process_vm(task, dst, page, this_len, 1);
 834                 if (!retval) {
 835                         if (!copied)
 836                                 copied = -EIO;
 837                         break;
 838                 }
 839                 copied += retval;
 840                 buf += retval;
 841                 dst += retval;
 842                 count -= retval;
 843         }
 844         *ppos = dst;
 845         free_page((unsigned long) page);
 846 out:
 847         put_task_struct(task);
 848 out_no_task:
 849         return copied;
 850 }
 851 #endif
 852
 853 loff_t mem_lseek(struct file *file, loff_t offset, int orig)
 854 {
 855         switch (orig) {
 856         case 0:
 857                 file->f_pos = offset;
 858                 break;
 859         case 1:
 860                 file->f_pos += offset;
 861                 break;
 862         default:
 863                 return -EINVAL;
 864         }
 865         force_successful_syscall_return();
 866         return file->f_pos;
 867 }
 868
 869 static const struct file_operations proc_mem_operations = {
 870         .llseek         = mem_lseek,
 871         .read           = mem_read,
 872         .write          = mem_write,
 873         .open           = mem_open,
 874 };
 875
 876 static ssize_t environ_read(struct file *file, char __user *buf,
 877                         size_t count, loff_t *ppos)
 878 {
 879         struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
 880         char *page;
 881         unsigned long src = *ppos;
 882         int ret = -ESRCH;
 883         struct mm_struct *mm;
 884
 885         if (!task)
 886                 goto out_no_task;
 887
 888         if (!ptrace_may_attach(task))
 889                 goto out;
 890
 891         ret = -ENOMEM;
 892         page = (char *)__get_free_page(GFP_TEMPORARY);
 893         if (!page)
 894                 goto out;
 895
 896         ret = 0;
 897
 898         mm = get_task_mm(task);
 899         if (!mm)
 900                 goto out_free;
 901
 902         while (count > 0) {
 903                 int this_len, retval, max_len;
 904
 905                 this_len = mm->env_end - (mm->env_start + src);
 906
 907                 if (this_len <= 0)
 908                         break;
 909
 910                 max_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
 911                 this_len = (this_len > max_len) ? max_len : this_len;
 912
 913                 retval = access_process_vm(task, (mm->env_start + src),
 914                         page, this_len, 0);
 915
 916                 if (retval <= 0) {
 917                         ret = retval;
 918                         break;
 919                 }
 920
 921                 if (copy_to_user(buf, page, retval)) {
 922                         ret = -EFAULT;
 923                         break;
 924                 }
 925
 926                 ret += retval;
 927                 src += retval;
 928                 buf += retval;
 929                 count -= retval;
 930         }
 931         *ppos = src;
 932
 933         mmput(mm);
 934 out_free:
 935         free_page((unsigned long) page);
 936 out:
 937         put_task_struct(task);
 938 out_no_task:
 939         return ret;
 940 }
 941
 942 static const struct file_operations proc_environ_operations = {
 943         .read           = environ_read,
 944 };
 945
 946 static ssize_t oom_adjust_read(struct file *file, char __user *buf,
 947                                 size_t count, loff_t *ppos)
 948 {
 949         struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
 950         char buffer[PROC_NUMBUF];
 951         size_t len;
 952         int oom_adjust;
 953
 954         if (!task)
 955                 return -ESRCH;
 956         oom_adjust = task->oomkilladj;
 957         put_task_struct(task);
 958
 959         len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
 960
 961         return simple_read_from_buffer(buf, count, ppos, buffer, len);
 962 }
 963
 964 static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 965                                 size_t count, loff_t *ppos)
 966 {
 967         struct task_struct *task;
 968         char buffer[PROC_NUMBUF], *end;
 969         int oom_adjust;
 970
 971         memset(buffer, 0, sizeof(buffer));
 972         if (count > sizeof(buffer) - 1)
 973                 count = sizeof(buffer) - 1;
 974         if (copy_from_user(buffer, buf, count))
 975                 return -EFAULT;
 976         oom_adjust = simple_strtol(buffer, &end, 0);
 977         if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
 978              oom_adjust != OOM_DISABLE)
 979                 return -EINVAL;
 980         if (*end == '\n')
 981                 end++;
 982         task = get_proc_task(file->f_path.dentry->d_inode);
 983         if (!task)
 984                 return -ESRCH;
 985         if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) {
 986                 put_task_struct(task);
 987                 return -EACCES;
 988         }
 989         task->oomkilladj = oom_adjust;
 990         put_task_struct(task);
 991         if (end - buffer == 0)
 992                 return -EIO;
 993         return end - buffer;
 994 }
 995
 996 static const struct file_operations proc_oom_adjust_operations = {
 997         .read           = oom_adjust_read,
 998         .write          = oom_adjust_write,
 999 };
1000
1001 #ifdef CONFIG_AUDITSYSCALL
1002 #define TMPBUFLEN 21
1003 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
1004                                   size_t count, loff_t *ppos)
1005 {
1006         struct inode * inode = file->f_path.dentry->d_inode;
1007         struct task_struct *task = get_proc_task(inode);
1008         ssize_t length;
1009         char tmpbuf[TMPBUFLEN];
1010
1011         if (!task)
1012                 return -ESRCH;
1013         length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1014                                 audit_get_loginuid(task));
1015         put_task_struct(task);
1016         return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1017 }
1018
1019 static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1020                                    size_t count, loff_t *ppos)
1021 {
1022         struct inode * inode = file->f_path.dentry->d_inode;
1023         char *page, *tmp;
1024         ssize_t length;
1025         uid_t loginuid;
1026
1027         if (!capable(CAP_AUDIT_CONTROL))
1028                 return -EPERM;
1029
1030         if (current != pid_task(proc_pid(inode), PIDTYPE_PID))
1031                 return -EPERM;
1032
1033         if (count >= PAGE_SIZE)
1034                 count = PAGE_SIZE - 1;
1035
1036         if (*ppos != 0) {
1037                 /* No partial writes. */
1038                 return -EINVAL;
1039         }
1040         page = (char*)__get_free_page(GFP_TEMPORARY);
1041         if (!page)
1042                 return -ENOMEM;
1043         length = -EFAULT;
1044         if (copy_from_user(page, buf, count))
1045                 goto out_free_page;
1046
1047         page[count] = '\0';
1048         loginuid = simple_strtoul(page, &tmp, 10);
1049         if (tmp == page) {
1050                 length = -EINVAL;
1051                 goto out_free_page;
1052
1053         }
1054         length = audit_set_loginuid(current, loginuid);
1055         if (likely(length == 0))
1056                 length = count;
1057
1058 out_free_page:
1059         free_page((unsigned long) page);
1060         return length;
1061 }
1062
1063 static const struct file_operations proc_loginuid_operations = {
1064         .read           = proc_loginuid_read,
1065         .write          = proc_loginuid_write,
1066 };
1067
1068 static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
1069                                   size_t count, loff_t *ppos)
1070 {
1071         struct inode * inode = file->f_path.dentry->d_inode;
1072         struct task_struct *task = get_proc_task(inode);
1073         ssize_t length;
1074         char tmpbuf[TMPBUFLEN];
1075
1076         if (!task)
1077                 return -ESRCH;
1078         length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1079                                 audit_get_sessionid(task));
1080         put_task_struct(task);
1081         return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1082 }
1083
1084 static const struct file_operations proc_sessionid_operations = {
1085         .read           = proc_sessionid_read,
1086 };
1087 #endif
1088
1089 #ifdef CONFIG_FAULT_INJECTION
1090 static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
1091                                       size_t count, loff_t *ppos)
1092 {
1093         struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
1094         char buffer[PROC_NUMBUF];
1095         size_t len;
1096         int make_it_fail;
1097
1098         if (!task)
1099                 return -ESRCH;
1100         make_it_fail = task->make_it_fail;
1101         put_task_struct(task);
1102
1103         len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail);
1104
1105         return simple_read_from_buffer(buf, count, ppos, buffer, len);
1106 }
1107
1108 static ssize_t proc_fault_inject_write(struct file * file,
1109                         const char __user * buf, size_t count, loff_t *ppos)
1110 {
1111         struct task_struct *task;
1112         char buffer[PROC_NUMBUF], *end;
1113         int make_it_fail;
1114
1115         if (!capable(CAP_SYS_RESOURCE))
1116                 return -EPERM;
1117         memset(buffer, 0, sizeof(buffer));
1118         if (count > sizeof(buffer) - 1)
1119                 count = sizeof(buffer) - 1;
1120         if (copy_from_user(buffer, buf, count))
1121                 return -EFAULT;
1122         make_it_fail = simple_strtol(buffer, &end, 0);
1123         if (*end == '\n')
1124                 end++;
1125         task = get_proc_task(file->f_dentry->d_inode);
1126         if (!task)
1127                 return -ESRCH;
1128         task->make_it_fail = make_it_fail;
1129         put_task_struct(task);
1130         if (end - buffer == 0)
1131                 return -EIO;
1132         return end - buffer;
1133 }
1134
1135 static const struct file_operations proc_fault_inject_operations = {
1136         .read           = proc_fault_inject_read,
1137         .write          = proc_fault_inject_write,
1138 };
1139 #endif
1140
1141
1142 #ifdef CONFIG_SCHED_DEBUG
1143 /*
1144  * Print out various scheduling related per-task fields:
1145  */
1146 static int sched_show(struct seq_file *m, void *v)
1147 {
1148         struct inode *inode = m->private;
1149         struct task_struct *p;
1150
1151         WARN_ON(!inode);
1152
1153         p = get_proc_task(inode);
1154         if (!p)
1155                 return -ESRCH;
1156         proc_sched_show_task(p, m);
1157
1158         put_task_struct(p);
1159
1160         return 0;
1161 }
1162
1163 static ssize_t
1164 sched_write(struct file *file, const char __user *buf,
1165             size_t count, loff_t *offset)
1166 {
1167         struct inode *inode = file->f_path.dentry->d_inode;
1168         struct task_struct *p;
1169
1170         WARN_ON(!inode);
1171
1172         p = get_proc_task(inode);
1173         if (!p)
1174                 return -ESRCH;
1175         proc_sched_set_task(p);
1176
1177         put_task_struct(p);
1178
1179         return count;
1180 }
1181
1182 static int sched_open(struct inode *inode, struct file *filp)
1183 {
1184         int ret;
1185
1186         ret = single_open(filp, sched_show, NULL);
1187         if (!ret) {
1188                 struct seq_file *m = filp->private_data;
1189
1190                 m->private = inode;
1191         }
1192         return ret;
1193 }
1194
1195 static const struct file_operations proc_pid_sched_operations = {
1196         .open           = sched_open,
1197         .read           = seq_read,
1198         .write          = sched_write,
1199         .llseek         = seq_lseek,
1200         .release        = single_release,
1201 };
1202
1203 #endif
1204
1205 /*
1206  * We added or removed a vma mapping the executable. The vmas are only mapped
1207  * during exec and are not mapped with the mmap system call.
1208  * Callers must hold down_write() on the mm's mmap_sem for these
1209  */
1210 void added_exe_file_vma(struct mm_struct *mm)
1211 {
1212         mm->num_exe_file_vmas++;
1213 }
1214
1215 void removed_exe_file_vma(struct mm_struct *mm)
1216 {
1217         mm->num_exe_file_vmas--;
1218         if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
1219                 fput(mm->exe_file);
1220                 mm->exe_file = NULL;
1221         }
1222
1223 }
1224
1225 void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
1226 {
1227         if (new_exe_file)
1228                 get_file(new_exe_file);
1229         if (mm->exe_file)
1230                 fput(mm->exe_file);
1231         mm->exe_file = new_exe_file;
1232         mm->num_exe_file_vmas = 0;
1233 }
1234
1235 struct file *get_mm_exe_file(struct mm_struct *mm)
1236 {
1237         struct file *exe_file;
1238
1239         /* We need mmap_sem to protect against races with removal of
1240          * VM_EXECUTABLE vmas */
1241         down_read(&mm->mmap_sem);
1242         exe_file = mm->exe_file;
1243         if (exe_file)
1244                 get_file(exe_file);
1245         up_read(&mm->mmap_sem);
1246         return exe_file;
1247 }
1248
1249 void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
1250 {
1251         /* It's safe to write the exe_file pointer without exe_file_lock because
1252          * this is called during fork when the task is not yet in /proc */
1253         newmm->exe_file = get_mm_exe_file(oldmm);
1254 }
1255
1256 static int proc_exe_link(struct inode *inode, struct path *exe_path)
1257 {
1258         struct task_struct *task;
1259         struct mm_struct *mm;
1260         struct file *exe_file;
1261
1262         task = get_proc_task(inode);
1263         if (!task)
1264                 return -ENOENT;
1265         mm = get_task_mm(task);
1266         put_task_struct(task);
1267         if (!mm)
1268                 return -ENOENT;
1269         exe_file = get_mm_exe_file(mm);
1270         mmput(mm);
1271         if (exe_file) {
1272                 *exe_path = exe_file->f_path;
1273                 path_get(&exe_file->f_path);
1274                 fput(exe_file);
1275                 return 0;
1276         } else
1277                 return -ENOENT;
1278 }
1279
1280 static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
1281 {
1282         struct inode *inode = dentry->d_inode;
1283         int error = -EACCES;
1284
1285         /* We don't need a base pointer in the /proc filesystem */
1286         path_put(&nd->path);
1287
1288         /* Are we allowed to snoop on the tasks file descriptors? */
1289         if (!proc_fd_access_allowed(inode))
1290                 goto out;
1291
1292         error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
1293         nd->last_type = LAST_BIND;
1294 out:
1295         return ERR_PTR(error);
1296 }
1297
1298 static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
1299 {
1300         char *tmp = (char*)__get_free_page(GFP_TEMPORARY);
1301         char *pathname;
1302         int len;
1303
1304         if (!tmp)
1305                 return -ENOMEM;
1306
1307         pathname = d_path(path, tmp, PAGE_SIZE);
1308         len = PTR_ERR(pathname);
1309         if (IS_ERR(pathname))
1310                 goto out;
1311         len = tmp + PAGE_SIZE - 1 - pathname;
1312
1313         if (len > buflen)
1314                 len = buflen;
1315         if (copy_to_user(buffer, pathname, len))
1316                 len = -EFAULT;
1317  out:
1318         free_page((unsigned long)tmp);
1319         return len;
1320 }
1321
1322 static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
1323 {
1324         int error = -EACCES;
1325         struct inode *inode = dentry->d_inode;
1326         struct path path;
1327
1328         /* Are we allowed to snoop on the tasks file descriptors? */
1329         if (!proc_fd_access_allowed(inode))
1330                 goto out;
1331
1332         error = PROC_I(inode)->op.proc_get_link(inode, &path);
1333         if (error)
1334                 goto out;
1335
1336         error = do_proc_readlink(&path, buffer, buflen);
1337         path_put(&path);
1338 out:
1339         return error;
1340 }
1341
1342 static const struct inode_operations proc_pid_link_inode_operations = {
1343         .readlink       = proc_pid_readlink,
1344         .follow_link    = proc_pid_follow_link,
1345         .setattr        = proc_setattr,
1346 };
1347
1348
1349 /* building an inode */
1350
1351 static int task_dumpable(struct task_struct *task)
1352 {
1353         int dumpable = 0;
1354         struct mm_struct *mm;
1355
1356         task_lock(task);
1357         mm = task->mm;
1358         if (mm)
1359                 dumpable = get_dumpable(mm);
1360         task_unlock(task);
1361         if(dumpable == 1)
1362                 return 1;
1363         return 0;
1364 }
1365
1366
1367 static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
1368 {
1369         struct inode * inode;
1370         struct proc_inode *ei;
1371
1372         /* We need a new inode */
1373
1374         inode = new_inode(sb);
1375         if (!inode)
1376                 goto out;
1377
1378         /* Common stuff */
1379         ei = PROC_I(inode);
1380         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1381         inode->i_op = &proc_def_inode_operations;
1382
1383         /*
1384          * grab the reference to task.
1385          */
1386         ei->pid = get_task_pid(task, PIDTYPE_PID);
1387         if (!ei->pid)
1388                 goto out_unlock;
1389
1390         inode->i_uid = 0;
1391         inode->i_gid = 0;
1392         if (task_dumpable(task)) {
1393                 inode->i_uid = task->euid;
1394                 inode->i_gid = task->egid;
1395         }
1396         security_task_to_inode(task, inode);
1397
1398 out:
1399         return inode;
1400
1401 out_unlock:
1402         iput(inode);
1403         return NULL;
1404 }
1405
1406 static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1407 {
1408         struct inode *inode = dentry->d_inode;
1409         struct task_struct *task;
1410         generic_fillattr(inode, stat);
1411
1412         rcu_read_lock();
1413         stat->uid = 0;
1414         stat->gid = 0;
1415         task = pid_task(proc_pid(inode), PIDTYPE_PID);
1416         if (task) {
1417                 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1418                     task_dumpable(task)) {
1419                         stat->uid = task->euid;
1420                         stat->gid = task->egid;
1421                 }
1422         }
1423         rcu_read_unlock();
1424         return 0;
1425 }
1426
1427 /* dentry stuff */
1428
1429 /*
1430  *      Exceptional case: normally we are not allowed to unhash a busy
1431  * directory. In this case, however, we can do it - no aliasing problems
1432  * due to the way we treat inodes.
1433  *
1434  * Rewrite the inode's ownerships here because the owning task may have
1435  * performed a setuid(), etc.
1436  *
1437  * Before the /proc/pid/status file was created the only way to read
1438  * the effective uid of a /process was to stat /proc/pid.  Reading
1439  * /proc/pid/status is slow enough that procps and other packages
1440  * kept stating /proc/pid.  To keep the rules in /proc simple I have
1441  * made this apply to all per process world readable and executable
1442  * directories.
1443  */
1444 static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1445 {
1446         struct inode *inode = dentry->d_inode;
1447         struct task_struct *task = get_proc_task(inode);
1448         if (task) {
1449                 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1450                     task_dumpable(task)) {
1451                         inode->i_uid = task->euid;
1452                         inode->i_gid = task->egid;
1453                 } else {
1454                         inode->i_uid = 0;
1455                         inode->i_gid = 0;
1456                 }
1457                 inode->i_mode &= ~(S_ISUID | S_ISGID);
1458                 security_task_to_inode(task, inode);
1459                 put_task_struct(task);
1460                 return 1;
1461         }
1462         d_drop(dentry);
1463         return 0;
1464 }
1465
1466 static int pid_delete_dentry(struct dentry * dentry)
1467 {
1468         /* Is the task we represent dead?
1469          * If so, then don't put the dentry on the lru list,
1470          * kill it immediately.
1471          */
1472         return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
1473 }
1474
1475 static struct dentry_operations pid_dentry_operations =
1476 {
1477         .d_revalidate   = pid_revalidate,
1478         .d_delete       = pid_delete_dentry,
1479 };
1480
1481 /* Lookups */
1482
/*
 * Callback that attaches an inode to a dentry on behalf of a task.  The
 * last argument is opaque data interpreted by the callback.
 */
typedef struct dentry *instantiate_t(struct inode *dir,
				     struct dentry *dentry,
				     struct task_struct *task,
				     const void *ptr);
1485
1486 /*
1487  * Fill a directory entry.
1488  *
1489  * If possible create the dcache entry and derive our inode number and
1490  * file type from dcache entry.
1491  *
1492  * Since all of the proc inode numbers are dynamically generated, the inode
1493  * numbers do not exist until the inode is cache.  This means creating the
1494  * the dcache entry in readdir is necessary to keep the inode numbers
1495  * reported by readdir in sync with the inode numbers reported
1496  * by stat.
1497  */
1498 static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
1499         char *name, int len,
1500         instantiate_t instantiate, struct task_struct *task, const void *ptr)
1501 {
1502         struct dentry *child, *dir = filp->f_path.dentry;
1503         struct inode *inode;
1504         struct qstr qname;
1505         ino_t ino = 0;
1506         unsigned type = DT_UNKNOWN;
1507
1508         qname.name = name;
1509         qname.len  = len;
1510         qname.hash = full_name_hash(name, len);
1511
1512         child = d_lookup(dir, &qname);
1513         if (!child) {
1514                 struct dentry *new;
1515                 new = d_alloc(dir, &qname);
1516                 if (new) {
1517                         child = instantiate(dir->d_inode, new, task, ptr);
1518                         if (child)
1519                                 dput(new);
1520                         else
1521                                 child = new;
1522                 }
1523         }
1524         if (!child || IS_ERR(child) || !child->d_inode)
1525                 goto end_instantiate;
1526         inode = child->d_inode;
1527         if (inode) {
1528                 ino = inode->i_ino;
1529                 type = inode->i_mode >> 12;
1530         }
1531         dput(child);
1532 end_instantiate:
1533         if (!ino)
1534                 ino = find_inode_number(dir, &qname);
1535         if (!ino)
1536                 ino = 1;
1537         return filldir(dirent, name, len, filp->f_pos, ino, type);
1538 }
1539
1540 static unsigned name_to_int(struct dentry *dentry)
1541 {
1542         const char *name = dentry->d_name.name;
1543         int len = dentry->d_name.len;
1544         unsigned n = 0;
1545
1546         if (len > 1 && *name == '0')
1547                 goto out;
1548         while (len-- > 0) {
1549                 unsigned c = *name++ - '0';
1550                 if (c > 9)
1551                         goto out;
1552                 if (n >= (~0U-9)/10)
1553                         goto out;
1554                 n *= 10;
1555                 n += c;
1556         }
1557         return n;
1558 out:
1559         return ~0U;
1560 }
1561
1562 #define PROC_FDINFO_MAX 64
1563
1564 static int proc_fd_info(struct inode *inode, struct path *path, char *info)
1565 {
1566         struct task_struct *task = get_proc_task(inode);
1567         struct files_struct *files = NULL;
1568         struct file *file;
1569         int fd = proc_fd(inode);
1570
1571         if (task) {
1572                 files = get_files_struct(task);
1573                 put_task_struct(task);
1574         }
1575         if (files) {
1576                 /*
1577                  * We are not taking a ref to the file structure, so we must
1578                  * hold ->file_lock.
1579                  */
1580                 spin_lock(&files->file_lock);
1581                 file = fcheck_files(files, fd);
1582                 if (file) {
1583                         if (path) {
1584                                 *path = file->f_path;
1585                                 path_get(&file->f_path);
1586                         }
1587                         if (info)
1588                                 snprintf(info, PROC_FDINFO_MAX,
1589                                          "pos:\t%lli\n"
1590                                          "flags:\t0%o\n",
1591                                          (long long) file->f_pos,
1592                                          file->f_flags);
1593                         spin_unlock(&files->file_lock);
1594                         put_files_struct(files);
1595                         return 0;
1596                 }
1597                 spin_unlock(&files->file_lock);
1598                 put_files_struct(files);
1599         }
1600         return -ENOENT;
1601 }
1602
1603 static int proc_fd_link(struct inode *inode, struct path *path)
1604 {
1605         return proc_fd_info(inode, path, NULL);
1606 }
1607
1608 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
1609 {
1610         struct inode *inode = dentry->d_inode;
1611         struct task_struct *task = get_proc_task(inode);
1612         int fd = proc_fd(inode);
1613         struct files_struct *files;
1614
1615         if (task) {
1616                 files = get_files_struct(task);
1617                 if (files) {
1618                         rcu_read_lock();
1619                         if (fcheck_files(files, fd)) {
1620                                 rcu_read_unlock();
1621                                 put_files_struct(files);
1622                                 if (task_dumpable(task)) {
1623                                         inode->i_uid = task->euid;
1624                                         inode->i_gid = task->egid;
1625                                 } else {
1626                                         inode->i_uid = 0;
1627                                         inode->i_gid = 0;
1628                                 }
1629                                 inode->i_mode &= ~(S_ISUID | S_ISGID);
1630                                 security_task_to_inode(task, inode);
1631                                 put_task_struct(task);
1632                                 return 1;
1633                         }
1634                         rcu_read_unlock();
1635                         put_files_struct(files);
1636                 }
1637                 put_task_struct(task);
1638         }
1639         d_drop(dentry);
1640         return 0;
1641 }
1642
1643 static struct dentry_operations tid_fd_dentry_operations =
1644 {
1645         .d_revalidate   = tid_fd_revalidate,
1646         .d_delete       = pid_delete_dentry,
1647 };
1648
1649 static struct dentry *proc_fd_instantiate(struct inode *dir,
1650         struct dentry *dentry, struct task_struct *task, const void *ptr)
1651 {
1652         unsigned fd = *(const unsigned *)ptr;
1653         struct file *file;
1654         struct files_struct *files;
1655         struct inode *inode;
1656         struct proc_inode *ei;
1657         struct dentry *error = ERR_PTR(-ENOENT);
1658
1659         inode = proc_pid_make_inode(dir->i_sb, task);
1660         if (!inode)
1661                 goto out;
1662         ei = PROC_I(inode);
1663         ei->fd = fd;
1664         files = get_files_struct(task);
1665         if (!files)
1666                 goto out_iput;
1667         inode->i_mode = S_IFLNK;
1668
1669         /*
1670          * We are not taking a ref to the file structure, so we must
1671          * hold ->file_lock.
1672          */
1673         spin_lock(&files->file_lock);
1674         file = fcheck_files(files, fd);
1675         if (!file)
1676                 goto out_unlock;
1677         if (file->f_mode & 1)
1678                 inode->i_mode |= S_IRUSR | S_IXUSR;
1679         if (file->f_mode & 2)
1680                 inode->i_mode |= S_IWUSR | S_IXUSR;
1681         spin_unlock(&files->file_lock);
1682         put_files_struct(files);
1683
1684         inode->i_op = &proc_pid_link_inode_operations;
1685         inode->i_size = 64;
1686         ei->op.proc_get_link = proc_fd_link;
1687         dentry->d_op = &tid_fd_dentry_operations;
1688         d_add(dentry, inode);
1689         /* Close the race of the process dying before we return the dentry */
1690         if (tid_fd_revalidate(dentry, NULL))
1691                 error = NULL;
1692
1693  out:
1694         return error;
1695 out_unlock:
1696         spin_unlock(&files->file_lock);
1697         put_files_struct(files);
1698 out_iput:
1699         iput(inode);
1700         goto out;
1701 }
1702
1703 static struct dentry *proc_lookupfd_common(struct inode *dir,
1704                                            struct dentry *dentry,
1705                                            instantiate_t instantiate)
1706 {
1707         struct task_struct *task = get_proc_task(dir);
1708         unsigned fd = name_to_int(dentry);
1709         struct dentry *result = ERR_PTR(-ENOENT);
1710
1711         if (!task)
1712                 goto out_no_task;
1713         if (fd == ~0U)
1714                 goto out;
1715
1716         result = instantiate(dir, dentry, task, &fd);
1717 out:
1718         put_task_struct(task);
1719 out_no_task:
1720         return result;
1721 }
1722
1723 static int proc_readfd_common(struct file * filp, void * dirent,
1724                               filldir_t filldir, instantiate_t instantiate)
1725 {
1726         struct dentry *dentry = filp->f_path.dentry;
1727         struct inode *inode = dentry->d_inode;
1728         struct task_struct *p = get_proc_task(inode);
1729         unsigned int fd, ino;
1730         int retval;
1731         struct files_struct * files;
1732
1733         retval = -ENOENT;
1734         if (!p)
1735                 goto out_no_task;
1736         retval = 0;
1737
1738         fd = filp->f_pos;
1739         switch (fd) {
1740                 case 0:
1741                         if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
1742                                 goto out;
1743                         filp->f_pos++;
1744                 case 1:
1745                         ino = parent_ino(dentry);
1746                         if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
1747                                 goto out;
1748                         filp->f_pos++;
1749                 default:
1750                         files = get_files_struct(p);
1751                         if (!files)
1752                                 goto out;
1753                         rcu_read_lock();
1754                         for (fd = filp->f_pos-2;
1755                              fd < files_fdtable(files)->max_fds;
1756                              fd++, filp->f_pos++) {
1757                                 char name[PROC_NUMBUF];
1758                                 int len;
1759
1760                                 if (!fcheck_files(files, fd))
1761                                         continue;
1762                                 rcu_read_unlock();
1763
1764                                 len = snprintf(name, sizeof(name), "%d", fd);
1765                                 if (proc_fill_cache(filp, dirent, filldir,
1766                                                     name, len, instantiate,
1767                                                     p, &fd) < 0) {
1768                                         rcu_read_lock();
1769                                         break;
1770                                 }
1771                                 rcu_read_lock();
1772                         }
1773                         rcu_read_unlock();
1774                         put_files_struct(files);
1775         }
1776 out:
1777         put_task_struct(p);
1778 out_no_task:
1779         return retval;
1780 }
1781
1782 static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
1783                                     struct nameidata *nd)
1784 {
1785         return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
1786 }
1787
1788 static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
1789 {
1790         return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
1791 }
1792
1793 static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
1794                                       size_t len, loff_t *ppos)
1795 {
1796         char tmp[PROC_FDINFO_MAX];
1797         int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp);
1798         if (!err)
1799                 err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp));
1800         return err;
1801 }
1802
1803 static const struct file_operations proc_fdinfo_file_operations = {
1804         .open           = nonseekable_open,
1805         .read           = proc_fdinfo_read,
1806 };
1807
1808 static const struct file_operations proc_fd_operations = {
1809         .read           = generic_read_dir,
1810         .readdir        = proc_readfd,
1811 };
1812
1813 /*
1814  * /proc/pid/fd needs a special permission handler so that a process can still
1815  * access /proc/self/fd after it has executed a setuid().
1816  */
1817 static int proc_fd_permission(struct inode *inode, int mask,
1818                                 struct nameidata *nd)
1819 {
1820         int rv;
1821
1822         rv = generic_permission(inode, mask, NULL);
1823         if (rv == 0)
1824                 return 0;
1825         if (task_pid(current) == proc_pid(inode))
1826                 rv = 0;
1827         return rv;
1828 }
1829
1830 /*
1831  * proc directories can do almost nothing..
1832  */
1833 static const struct inode_operations proc_fd_inode_operations = {
1834         .lookup         = proc_lookupfd,
1835         .permission     = proc_fd_permission,
1836         .setattr        = proc_setattr,
1837 };
1838
1839 static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
1840         struct dentry *dentry, struct task_struct *task, const void *ptr)
1841 {
1842         unsigned fd = *(unsigned *)ptr;
1843         struct inode *inode;
1844         struct proc_inode *ei;
1845         struct dentry *error = ERR_PTR(-ENOENT);
1846
1847         inode = proc_pid_make_inode(dir->i_sb, task);
1848         if (!inode)
1849                 goto out;
1850         ei = PROC_I(inode);
1851         ei->fd = fd;
1852         inode->i_mode = S_IFREG | S_IRUSR;
1853         inode->i_fop = &proc_fdinfo_file_operations;
1854         dentry->d_op = &tid_fd_dentry_operations;
1855         d_add(dentry, inode);
1856         /* Close the race of the process dying before we return the dentry */
1857         if (tid_fd_revalidate(dentry, NULL))
1858                 error = NULL;
1859
1860  out:
1861         return error;
1862 }
1863
1864 static struct dentry *proc_lookupfdinfo(struct inode *dir,
1865                                         struct dentry *dentry,
1866                                         struct nameidata *nd)
1867 {
1868         return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
1869 }
1870
1871 static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
1872 {
1873         return proc_readfd_common(filp, dirent, filldir,
1874                                   proc_fdinfo_instantiate);
1875 }
1876
1877 static const struct file_operations proc_fdinfo_operations = {
1878         .read           = generic_read_dir,
1879         .readdir        = proc_readfdinfo,
1880 };
1881
1882 /*
1883  * proc directories can do almost nothing..
1884  */
1885 static const struct inode_operations proc_fdinfo_inode_operations = {
1886         .lookup         = proc_lookupfdinfo,
1887         .setattr        = proc_setattr,
1888 };
1889
1890
1891 static struct dentry *proc_pident_instantiate(struct inode *dir,
1892         struct dentry *dentry, struct task_struct *task, const void *ptr)
1893 {
1894         const struct pid_entry *p = ptr;
1895         struct inode *inode;
1896         struct proc_inode *ei;
1897         struct dentry *error = ERR_PTR(-EINVAL);
1898
1899         inode = proc_pid_make_inode(dir->i_sb, task);
1900         if (!inode)
1901                 goto out;
1902
1903         ei = PROC_I(inode);
1904         inode->i_mode = p->mode;
1905         if (S_ISDIR(inode->i_mode))
1906                 inode->i_nlink = 2;     /* Use getattr to fix if necessary */
1907         if (p->iop)
1908                 inode->i_op = p->iop;
1909         if (p->fop)
1910                 inode->i_fop = p->fop;
1911         ei->op = p->op;
1912         dentry->d_op = &pid_dentry_operations;
1913         d_add(dentry, inode);
1914         /* Close the race of the process dying before we return the dentry */
1915         if (pid_revalidate(dentry, NULL))
1916                 error = NULL;
1917 out:
1918         return error;
1919 }
1920
1921 static struct dentry *proc_pident_lookup(struct inode *dir,
1922                                          struct dentry *dentry,
1923                                          const struct pid_entry *ents,
1924                                          unsigned int nents)
1925 {
1926         struct inode *inode;
1927         struct dentry *error;
1928         struct task_struct *task = get_proc_task(dir);
1929         const struct pid_entry *p, *last;
1930
1931         error = ERR_PTR(-ENOENT);
1932         inode = NULL;
1933
1934         if (!task)
1935                 goto out_no_task;
1936
1937         /*
1938          * Yes, it does not scale. And it should not. Don't add
1939          * new entries into /proc/<tgid>/ without very good reasons.
1940          */
1941         last = &ents[nents - 1];
1942         for (p = ents; p <= last; p++) {
1943                 if (p->len != dentry->d_name.len)
1944                         continue;
1945                 if (!memcmp(dentry->d_name.name, p->name, p->len))
1946                         break;
1947         }
1948         if (p > last)
1949                 goto out;
1950
1951         error = proc_pident_instantiate(dir, dentry, task, p);
1952 out:
1953         put_task_struct(task);
1954 out_no_task:
1955         return error;
1956 }
1957
1958 static int proc_pident_fill_cache(struct file *filp, void *dirent,
1959         filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
1960 {
1961         return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
1962                                 proc_pident_instantiate, task, p);
1963 }
1964
1965 static int proc_pident_readdir(struct file *filp,
1966                 void *dirent, filldir_t filldir,
1967                 const struct pid_entry *ents, unsigned int nents)
1968 {
1969         int i;
1970         struct dentry *dentry = filp->f_path.dentry;
1971         struct inode *inode = dentry->d_inode;
1972         struct task_struct *task = get_proc_task(inode);
1973         const struct pid_entry *p, *last;
1974         ino_t ino;
1975         int ret;
1976
1977         ret = -ENOENT;
1978         if (!task)
1979                 goto out_no_task;
1980
1981         ret = 0;
1982         i = filp->f_pos;
1983         switch (i) {
1984         case 0:
1985                 ino = inode->i_ino;
1986                 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
1987                         goto out;
1988                 i++;
1989                 filp->f_pos++;
1990                 /* fall through */
1991         case 1:
1992                 ino = parent_ino(dentry);
1993                 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
1994                         goto out;
1995                 i++;
1996                 filp->f_pos++;
1997                 /* fall through */
1998         default:
1999                 i -= 2;
2000                 if (i >= nents) {
2001                         ret = 1;
2002                         goto out;
2003                 }
2004                 p = ents + i;
2005                 last = &ents[nents - 1];
2006                 while (p <= last) {
2007                         if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0)
2008                                 goto out;
2009                         filp->f_pos++;
2010                         p++;
2011                 }
2012         }
2013
2014         ret = 1;
2015 out:
2016         put_task_struct(task);
2017 out_no_task:
2018         return ret;
2019 }
2020
2021 #ifdef CONFIG_SECURITY
2022 static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
2023                                   size_t count, loff_t *ppos)
2024 {
2025         struct inode * inode = file->f_path.dentry->d_inode;
2026         char *p = NULL;
2027         ssize_t length;
2028         struct task_struct *task = get_proc_task(inode);
2029
2030         if (!task)
2031                 return -ESRCH;
2032
2033         length = security_getprocattr(task,
2034                                       (char*)file->f_path.dentry->d_name.name,
2035                                       &p);
2036         put_task_struct(task);
2037         if (length > 0)
2038                 length = simple_read_from_buffer(buf, count, ppos, p, length);
2039         kfree(p);
2040         return length;
2041 }
2042
2043 static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
2044                                    size_t count, loff_t *ppos)
2045 {
2046         struct inode * inode = file->f_path.dentry->d_inode;
2047         char *page;
2048         ssize_t length;
2049         struct task_struct *task = get_proc_task(inode);
2050
2051         length = -ESRCH;
2052         if (!task)
2053                 goto out_no_task;
2054         if (count > PAGE_SIZE)
2055                 count = PAGE_SIZE;
2056
2057         /* No partial writes. */
2058         length = -EINVAL;
2059         if (*ppos != 0)
2060                 goto out;
2061
2062         length = -ENOMEM;
2063         page = (char*)__get_free_page(GFP_TEMPORARY);
2064         if (!page)
2065                 goto out;
2066
2067         length = -EFAULT;
2068         if (copy_from_user(page, buf, count))
2069                 goto out_free;
2070
2071         length = security_setprocattr(task,
2072                                       (char*)file->f_path.dentry->d_name.name,
2073                                       (void*)page, count);
2074 out_free:
2075         free_page((unsigned long) page);
2076 out:
2077         put_task_struct(task);
2078 out_no_task:
2079         return length;
2080 }
2081
2082 static const struct file_operations proc_pid_attr_operations = {
2083         .read           = proc_pid_attr_read,
2084         .write          = proc_pid_attr_write,
2085 };
2086
/*
 * Entries served by proc_attr_dir_readdir() and proc_attr_dir_lookup().
 * Table order is the order in which readdir reports the entries.
 * NOTE(review): every row names pid_attr; presumably REG() resolves that
 * to proc_pid_attr_operations -- confirm against the macro definition.
 */
static const struct pid_entry attr_dir_stuff[] = {
        REG("current",    S_IRUGO|S_IWUGO, pid_attr),
        REG("prev",       S_IRUGO,         pid_attr),
        REG("exec",       S_IRUGO|S_IWUGO, pid_attr),
        REG("fscreate",   S_IRUGO|S_IWUGO, pid_attr),
        REG("keycreate",  S_IRUGO|S_IWUGO, pid_attr),
        REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr),
};
2095
2096 static int proc_attr_dir_readdir(struct file * filp,
2097                              void * dirent, filldir_t filldir)
2098 {
2099         return proc_pident_readdir(filp,dirent,filldir,
2100                                    attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff));
2101 }
2102
2103 static const struct file_operations proc_attr_dir_operations = {
2104         .read           = generic_read_dir,
2105         .readdir        = proc_attr_dir_readdir,
2106 };
2107
2108 static struct dentry *proc_attr_dir_lookup(struct inode *dir,
2109                                 struct dentry *dentry, struct nameidata *nd)
2110 {
2111         return proc_pident_lookup(dir, dentry,
2112                                   attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
2113 }
2114
2115 static const struct inode_operations proc_attr_dir_inode_operations = {
2116         .lookup         = proc_attr_dir_lookup,
2117         .getattr        = pid_getattr,
2118         .setattr        = proc_setattr,
2119 };
2120
2121 #endif
2122
2123 #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
2124 static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
2125                                          size_t count, loff_t *ppos)
2126 {
2127         struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
2128         struct mm_struct *mm;
2129         char buffer[PROC_NUMBUF];
2130         size_t len;
2131         int ret;
2132
2133         if (!task)
2134                 return -ESRCH;
2135
2136         ret = 0;
2137         mm = get_task_mm(task);
2138         if (mm) {
2139                 len = snprintf(buffer, sizeof(buffer), "%08lx\n",
2140                                ((mm->flags & MMF_DUMP_FILTER_MASK) >>
2141                                 MMF_DUMP_FILTER_SHIFT));
2142                 mmput(mm);
2143                 ret = simple_read_from_buffer(buf, count, ppos, buffer, len);
2144         }
2145
2146         put_task_struct(task);
2147
2148         return ret;
2149 }
2150
2151 static ssize_t proc_coredump_filter_write(struct file *file,
2152                                           const char __user *buf,
2153                                           size_t count,
2154                                           loff_t *ppos)
2155 {
2156         struct task_struct *task;
2157         struct mm_struct *mm;
2158         char buffer[PROC_NUMBUF], *end;
2159         unsigned int val;
2160         int ret;
2161         int i;
2162         unsigned long mask;
2163
2164         ret = -EFAULT;
2165         memset(buffer, 0, sizeof(buffer));
2166         if (count > sizeof(buffer) - 1)
2167                 count = sizeof(buffer) - 1;
2168         if (copy_from_user(buffer, buf, count))
2169                 goto out_no_task;
2170
2171         ret = -EINVAL;
2172         val = (unsigned int)simple_strtoul(buffer, &end, 0);
2173         if (*end == '\n')
2174                 end++;
2175         if (end - buffer == 0)
2176                 goto out_no_task;
2177
2178         ret = -ESRCH;
2179         task = get_proc_task(file->f_dentry->d_inode);
2180         if (!task)
2181                 goto out_no_task;
2182
2183         ret = end - buffer;
2184         mm = get_task_mm(task);
2185         if (!mm)
2186                 goto out_no_mm;
2187
2188         for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
2189                 if (val & mask)
2190                         set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
2191                 else
2192                         clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
2193         }
2194
2195         mmput(mm);
2196  out_no_mm:
2197         put_task_struct(task);
2198  out_no_task:
2199         return ret;
2200 }
2201
2202 static const struct file_operations proc_coredump_filter_operations = {
2203         .read           = proc_coredump_filter_read,
2204         .write          = proc_coredump_filter_write,
2205 };
2206 #endif
2207
2208 /*
2209  * /proc/self:
2210  */
2211 static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
2212                               int buflen)
2213 {
2214         struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2215         pid_t tgid = task_tgid_nr_ns(current, ns);
2216         char tmp[PROC_NUMBUF];
2217         if (!tgid)
2218                 return -ENOENT;
2219         sprintf(tmp, "%d", tgid);
2220         return vfs_readlink(dentry,buffer,buflen,tmp);
2221 }
2222
2223 static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
2224 {
2225         struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2226         pid_t tgid = task_tgid_nr_ns(current, ns);
2227         char tmp[PROC_NUMBUF];
2228         if (!tgid)
2229                 return ERR_PTR(-ENOENT);
2230         sprintf(tmp, "%d", task_tgid_nr_ns(current, ns));
2231         return ERR_PTR(vfs_follow_link(nd,tmp));
2232 }
2233
2234 static const struct inode_operations proc_self_inode_operations = {
2235         .readlink       = proc_self_readlink,
2236         .follow_link    = proc_self_follow_link,
2237 };
2238
/*
 * proc base
 *
 * These are the directory entries in the root directory of /proc
 * that properly belong to the /proc filesystem, as they
 * describe something that is process related.
 *
 * The table is searched by proc_base_lookup() and listed by
 * proc_pid_readdir(); its size also feeds TGID_OFFSET.
 */
static const struct pid_entry proc_base_stuff[] = {
        NOD("self", S_IFLNK|S_IRWXUGO,
                &proc_self_inode_operations, NULL, {}),
};
2250
2251 /*
2252  *      Exceptional case: normally we are not allowed to unhash a busy
2253  * directory. In this case, however, we can do it - no aliasing problems
2254  * due to the way we treat inodes.
2255  */
2256 static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
2257 {
2258         struct inode *inode = dentry->d_inode;
2259         struct task_struct *task = get_proc_task(inode);
2260         if (task) {
2261                 put_task_struct(task);
2262                 return 1;
2263         }
2264         d_drop(dentry);
2265         return 0;
2266 }
2267
2268 static struct dentry_operations proc_base_dentry_operations =
2269 {
2270         .d_revalidate   = proc_base_revalidate,
2271         .d_delete       = pid_delete_dentry,
2272 };
2273
2274 static struct dentry *proc_base_instantiate(struct inode *dir,
2275         struct dentry *dentry, struct task_struct *task, const void *ptr)
2276 {
2277         const struct pid_entry *p = ptr;
2278         struct inode *inode;
2279         struct proc_inode *ei;
2280         struct dentry *error = ERR_PTR(-EINVAL);
2281
2282         /* Allocate the inode */
2283         error = ERR_PTR(-ENOMEM);
2284         inode = new_inode(dir->i_sb);
2285         if (!inode)
2286                 goto out;
2287
2288         /* Initialize the inode */
2289         ei = PROC_I(inode);
2290         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
2291
2292         /*
2293          * grab the reference to the task.
2294          */
2295         ei->pid = get_task_pid(task, PIDTYPE_PID);
2296         if (!ei->pid)
2297                 goto out_iput;
2298
2299         inode->i_uid = 0;
2300         inode->i_gid = 0;
2301         inode->i_mode = p->mode;
2302         if (S_ISDIR(inode->i_mode))
2303                 inode->i_nlink = 2;
2304         if (S_ISLNK(inode->i_mode))
2305                 inode->i_size = 64;
2306         if (p->iop)
2307                 inode->i_op = p->iop;
2308         if (p->fop)
2309                 inode->i_fop = p->fop;
2310         ei->op = p->op;
2311         dentry->d_op = &proc_base_dentry_operations;
2312         d_add(dentry, inode);
2313         error = NULL;
2314 out:
2315         return error;
2316 out_iput:
2317         iput(inode);
2318         goto out;
2319 }
2320
2321 static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry)
2322 {
2323         struct dentry *error;
2324         struct task_struct *task = get_proc_task(dir);
2325         const struct pid_entry *p, *last;
2326
2327         error = ERR_PTR(-ENOENT);
2328
2329         if (!task)
2330                 goto out_no_task;
2331
2332         /* Lookup the directory entry */
2333         last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1];
2334         for (p = proc_base_stuff; p <= last; p++) {
2335                 if (p->len != dentry->d_name.len)
2336                         continue;
2337                 if (!memcmp(dentry->d_name.name, p->name, p->len))
2338                         break;
2339         }
2340         if (p > last)
2341                 goto out;
2342
2343         error = proc_base_instantiate(dir, dentry, task, p);
2344
2345 out:
2346         put_task_struct(task);
2347 out_no_task:
2348         return error;
2349 }
2350
2351 static int proc_base_fill_cache(struct file *filp, void *dirent,
2352         filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
2353 {
2354         return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
2355                                 proc_base_instantiate, task, p);
2356 }
2357
2358 #ifdef CONFIG_TASK_IO_ACCOUNTING
2359 static int proc_pid_io_accounting(struct task_struct *task, char *buffer)
2360 {
2361         return sprintf(buffer,
2362 #ifdef CONFIG_TASK_XACCT
2363                         "rchar: %llu\n"
2364                         "wchar: %llu\n"
2365                         "syscr: %llu\n"
2366                         "syscw: %llu\n"
2367 #endif
2368                         "read_bytes: %llu\n"
2369                         "write_bytes: %llu\n"
2370                         "cancelled_write_bytes: %llu\n",
2371 #ifdef CONFIG_TASK_XACCT
2372                         (unsigned long long)task->rchar,
2373                         (unsigned long long)task->wchar,
2374                         (unsigned long long)task->syscr,
2375                         (unsigned long long)task->syscw,
2376 #endif
2377                         (unsigned long long)task->ioac.read_bytes,
2378                         (unsigned long long)task->ioac.write_bytes,
2379                         (unsigned long long)task->ioac.cancelled_write_bytes);
2380 }
2381 #endif
2382
/*
 * Thread groups
 *
 * Forward declarations of two objects defined elsewhere in this file.
 * NOTE(review): presumably needed because the DIR("task", ...) row of
 * tgid_base_stuff refers to them before their definitions -- confirm
 * against the DIR() macro.
 */
static const struct file_operations proc_task_operations;
static const struct inode_operations proc_task_inode_operations;
2388
/*
 * Entries of a thread-group directory, served by
 * proc_tgid_base_readdir() and proc_tgid_base_lookup().  Table order is
 * the order in which readdir reports the entries, and rows guarded by
 * #ifdef exist only when the corresponding option is configured.
 */
static const struct pid_entry tgid_base_stuff[] = {
        DIR("task",       S_IRUGO|S_IXUGO, task),
        DIR("fd",         S_IRUSR|S_IXUSR, fd),
        DIR("fdinfo",     S_IRUSR|S_IXUSR, fdinfo),
#ifdef CONFIG_NET
        DIR("net",        S_IRUGO|S_IXUGO, net),
#endif
        REG("environ",    S_IRUSR, environ),
        INF("auxv",       S_IRUSR, pid_auxv),
        ONE("status",     S_IRUGO, pid_status),
        INF("limits",     S_IRUSR, pid_limits),
#ifdef CONFIG_SCHED_DEBUG
        REG("sched",      S_IRUGO|S_IWUSR, pid_sched),
#endif
        INF("cmdline",    S_IRUGO, pid_cmdline),
        ONE("stat",       S_IRUGO, tgid_stat),
        ONE("statm",      S_IRUGO, pid_statm),
        REG("maps",       S_IRUGO, maps),
#ifdef CONFIG_NUMA
        REG("numa_maps",  S_IRUGO, numa_maps),
#endif
        REG("mem",        S_IRUSR|S_IWUSR, mem),
        LNK("cwd",        cwd),
        LNK("root",       root),
        LNK("exe",        exe),
        REG("mounts",     S_IRUGO, mounts),
        REG("mountinfo",  S_IRUGO, mountinfo),
        REG("mountstats", S_IRUSR, mountstats),
#ifdef CONFIG_PROC_PAGE_MONITOR
        REG("clear_refs", S_IWUSR, clear_refs),
        REG("smaps",      S_IRUGO, smaps),
        REG("pagemap",    S_IRUSR, pagemap),
#endif
#ifdef CONFIG_SECURITY
        DIR("attr",       S_IRUGO|S_IXUGO, attr_dir),
#endif
#ifdef CONFIG_KALLSYMS
        INF("wchan",      S_IRUGO, pid_wchan),
#endif
#ifdef CONFIG_SCHEDSTATS
        INF("schedstat",  S_IRUGO, pid_schedstat),
#endif
#ifdef CONFIG_LATENCYTOP
        REG("latency",  S_IRUGO, lstats),
#endif
#ifdef CONFIG_PROC_PID_CPUSET
        REG("cpuset",     S_IRUGO, cpuset),
#endif
#ifdef CONFIG_CGROUPS
        REG("cgroup",  S_IRUGO, cgroup),
#endif
        INF("oom_score",  S_IRUGO, oom_score),
        REG("oom_adj",    S_IRUGO|S_IWUSR, oom_adjust),
#ifdef CONFIG_AUDITSYSCALL
        REG("loginuid",   S_IWUSR|S_IRUGO, loginuid),
        REG("sessionid",  S_IRUGO, sessionid),
#endif
#ifdef CONFIG_FAULT_INJECTION
        REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
#endif
#if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
        REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter),
#endif
#ifdef CONFIG_TASK_IO_ACCOUNTING
        INF("io",       S_IRUGO, pid_io_accounting),
#endif
};
2456
2457 static int proc_tgid_base_readdir(struct file * filp,
2458                              void * dirent, filldir_t filldir)
2459 {
2460         return proc_pident_readdir(filp,dirent,filldir,
2461                                    tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff));
2462 }
2463
2464 static const struct file_operations proc_tgid_base_operations = {
2465         .read           = generic_read_dir,
2466         .readdir        = proc_tgid_base_readdir,
2467 };
2468
2469 static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
2470         return proc_pident_lookup(dir, dentry,
2471                                   tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
2472 }
2473
2474 static const struct inode_operations proc_tgid_base_inode_operations = {
2475         .lookup         = proc_tgid_base_lookup,
2476         .getattr        = pid_getattr,
2477         .setattr        = proc_setattr,
2478 };
2479
2480 static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
2481 {
2482         struct dentry *dentry, *leader, *dir;
2483         char buf[PROC_NUMBUF];
2484         struct qstr name;
2485
2486         name.name = buf;
2487         name.len = snprintf(buf, sizeof(buf), "%d", pid);
2488         dentry = d_hash_and_lookup(mnt->mnt_root, &name);
2489         if (dentry) {
2490                 if (!(current->flags & PF_EXITING))
2491                         shrink_dcache_parent(dentry);
2492                 d_drop(dentry);
2493                 dput(dentry);
2494         }
2495
2496         if (tgid == 0)
2497                 goto out;
2498
2499         name.name = buf;
2500         name.len = snprintf(buf, sizeof(buf), "%d", tgid);
2501         leader = d_hash_and_lookup(mnt->mnt_root, &name);
2502         if (!leader)
2503                 goto out;
2504
2505         name.name = "task";
2506         name.len = strlen(name.name);
2507         dir = d_hash_and_lookup(leader, &name);
2508         if (!dir)
2509                 goto out_put_leader;
2510
2511         name.name = buf;
2512         name.len = snprintf(buf, sizeof(buf), "%d", pid);
2513         dentry = d_hash_and_lookup(dir, &name);
2514         if (dentry) {
2515                 shrink_dcache_parent(dentry);
2516                 d_drop(dentry);
2517                 dput(dentry);
2518         }
2519
2520         dput(dir);
2521 out_put_leader:
2522         dput(leader);
2523 out:
2524         return;
2525 }
2526
2527 /**
2528  * proc_flush_task -  Remove dcache entries for @task from the /proc dcache.
2529  * @task: task that should be flushed.
2530  *
2531  * When flushing dentries from proc, one needs to flush them from global
2532  * proc (proc_mnt) and from all the namespaces' procs this task was seen
2533  * in. This call is supposed to do all of this job.
2534  *
2535  * Looks in the dcache for
2536  * /proc/@pid
2537  * /proc/@tgid/task/@pid
2538  * if either directory is present flushes it and all of it'ts children
2539  * from the dcache.
2540  *
2541  * It is safe and reasonable to cache /proc entries for a task until
2542  * that task exits.  After that they just clog up the dcache with
2543  * useless entries, possibly causing useful dcache entries to be
2544  * flushed instead.  This routine is proved to flush those useless
2545  * dcache entries at process exit time.
2546  *
2547  * NOTE: This routine is just an optimization so it does not guarantee
2548  *       that no dcache entries will exist at process exit time it
2549  *       just makes it very unlikely that any will persist.
2550  */
2551
2552 void proc_flush_task(struct task_struct *task)
2553 {
2554         int i;
2555         struct pid *pid, *tgid = NULL;
2556         struct upid *upid;
2557
2558         pid = task_pid(task);
2559         if (thread_group_leader(task))
2560                 tgid = task_tgid(task);
2561
2562         for (i = 0; i <= pid->level; i++) {
2563                 upid = &pid->numbers[i];
2564                 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
2565                         tgid ? tgid->numbers[i].nr : 0);
2566         }
2567
2568         upid = &pid->numbers[pid->level];
2569         if (upid->nr == 1)
2570                 pid_ns_release_proc(upid->ns);
2571 }
2572
2573 static struct dentry *proc_pid_instantiate(struct inode *dir,
2574                                            struct dentry * dentry,
2575                                            struct task_struct *task, const void *ptr)
2576 {
2577         struct dentry *error = ERR_PTR(-ENOENT);
2578         struct inode *inode;
2579
2580         inode = proc_pid_make_inode(dir->i_sb, task);
2581         if (!inode)
2582                 goto out;
2583
2584         inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
2585         inode->i_op = &proc_tgid_base_inode_operations;
2586         inode->i_fop = &proc_tgid_base_operations;
2587         inode->i_flags|=S_IMMUTABLE;
2588         inode->i_nlink = 5;
2589 #ifdef CONFIG_SECURITY
2590         inode->i_nlink += 1;
2591 #endif
2592
2593         dentry->d_op = &pid_dentry_operations;
2594
2595         d_add(dentry, inode);
2596         /* Close the race of the process dying before we return the dentry */
2597         if (pid_revalidate(dentry, NULL))
2598                 error = NULL;
2599 out:
2600         return error;
2601 }
2602
2603 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2604 {
2605         struct dentry *result = ERR_PTR(-ENOENT);
2606         struct task_struct *task;
2607         unsigned tgid;
2608         struct pid_namespace *ns;
2609
2610         result = proc_base_lookup(dir, dentry);
2611         if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT)
2612                 goto out;
2613
2614         tgid = name_to_int(dentry);
2615         if (tgid == ~0U)
2616                 goto out;
2617
2618         ns = dentry->d_sb->s_fs_info;
2619         rcu_read_lock();
2620         task = find_task_by_pid_ns(tgid, ns);
2621         if (task)
2622                 get_task_struct(task);
2623         rcu_read_unlock();
2624         if (!task)
2625                 goto out;
2626
2627         result = proc_pid_instantiate(dir, dentry, task, NULL);
2628         put_task_struct(task);
2629 out:
2630         return result;
2631 }
2632
/*
 * Cursor for walking thread groups in increasing tgid order; next_tgid()
 * uses it to find the first task with tgid >= the tgid held here.
 */
struct tgid_iter {
        unsigned int tgid;              /* id to search from; updated to the id found */
        struct task_struct *task;       /* task found by next_tgid(), or NULL */
};
2641 static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
2642 {
2643         struct pid *pid;
2644
2645         if (iter.task)
2646                 put_task_struct(iter.task);
2647         rcu_read_lock();
2648 retry:
2649         iter.task = NULL;
2650         pid = find_ge_pid(iter.tgid, ns);
2651         if (pid) {
2652                 iter.tgid = pid_nr_ns(pid, ns);
2653                 iter.task = pid_task(pid, PIDTYPE_PID);
2654                 /* What we to know is if the pid we have find is the
2655                  * pid of a thread_group_leader.  Testing for task
2656                  * being a thread_group_leader is the obvious thing
2657                  * todo but there is a window when it fails, due to
2658                  * the pid transfer logic in de_thread.
2659                  *
2660                  * So we perform the straight forward test of seeing
2661                  * if the pid we have found is the pid of a thread
2662                  * group leader, and don't worry if the task we have
2663                  * found doesn't happen to be a thread group leader.
2664                  * As we don't care in the case of readdir.
2665                  */
2666                 if (!iter.task || !has_group_leader_pid(iter.task)) {
2667                         iter.tgid += 1;
2668                         goto retry;
2669                 }
2670                 get_task_struct(iter.task);
2671         }
2672         rcu_read_unlock();
2673         return iter;
2674 }
2675
/*
 * Directory position of tgid 0 in the /proc root: proc_pid_readdir() maps
 * a tgid to f_pos by adding this offset, which leaves room for the
 * proc_base_stuff entries starting at FIRST_PROCESS_ENTRY.
 */
#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff))
2677
2678 static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
2679         struct tgid_iter iter)
2680 {
2681         char name[PROC_NUMBUF];
2682         int len = snprintf(name, sizeof(name), "%d", iter.tgid);
2683         return proc_fill_cache(filp, dirent, filldir, name, len,
2684                                 proc_pid_instantiate, iter.task, NULL);
2685 }
2686
2687 /* for the /proc/ directory itself, after non-process stuff has been done */
2688 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
2689 {
2690         unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
2691         struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode);
2692         struct tgid_iter iter;
2693         struct pid_namespace *ns;
2694
2695         if (!reaper)
2696                 goto out_no_task;
2697
2698         for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
2699                 const struct pid_entry *p = &proc_base_stuff[nr];
2700                 if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
2701                         goto out;
2702         }
2703
2704         ns = filp->f_dentry->d_sb->s_fs_info;
2705         iter.task = NULL;
2706         iter.tgid = filp->f_pos - TGID_OFFSET;
2707         for (iter = next_tgid(ns, iter);
2708              iter.task;
2709              iter.tgid += 1, iter = next_tgid(ns, iter)) {
2710                 filp->f_pos = iter.tgid + TGID_OFFSET;
2711                 if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) {
2712                         put_task_struct(iter.task);
2713                         goto out;
2714                 }
2715         }
2716         filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
2717 out:
2718         put_task_struct(reaper);
2719 out_no_task:
2720         return 0;
2721 }
2722
2723 /*
2724  * Tasks
2725  */
2726 static const struct pid_entry tid_base_stuff[] = {
2727         DIR("fd",        S_IRUSR|S_IXUSR, fd),
2728         DIR("fdinfo",    S_IRUSR|S_IXUSR, fdinfo),
2729         REG("environ",   S_IRUSR, environ),
2730         INF("auxv",      S_IRUSR, pid_auxv),
2731         ONE("status",    S_IRUGO, pid_status),
2732         INF("limits",    S_IRUSR, pid_limits),
2733 #ifdef CONFIG_SCHED_DEBUG
2734         REG("sched",     S_IRUGO|S_IWUSR, pid_sched),
2735 #endif
2736         INF("cmdline",   S_IRUGO, pid_cmdline),
2737         ONE("stat",      S_IRUGO, tid_stat),
2738         ONE("statm",     S_IRUGO, pid_statm),
2739         REG("maps",      S_IRUGO, maps),
2740 #ifdef CONFIG_NUMA
2741         REG("numa_maps", S_IRUGO, numa_maps),
2742 #endif
2743         REG("mem",       S_IRUSR|S_IWUSR, mem),
2744         LNK("cwd",       cwd),
2745         LNK("root",      root),
2746         LNK("exe",       exe),
2747         REG("mounts",    S_IRUGO, mounts),
2748         REG("mountinfo",  S_IRUGO, mountinfo),
2749 #ifdef CONFIG_PROC_PAGE_MONITOR
2750         REG("clear_refs", S_IWUSR, clear_refs),
2751         REG("smaps",     S_IRUGO, smaps),
2752         REG("pagemap",    S_IRUSR, pagemap),
2753 #endif
2754 #ifdef CONFIG_SECURITY
2755         DIR("attr",      S_IRUGO|S_IXUGO, attr_dir),
2756 #endif
2757 #ifdef CONFIG_KALLSYMS
2758         INF("wchan",     S_IRUGO, pid_wchan),
2759 #endif
2760 #ifdef CONFIG_SCHEDSTATS
2761         INF("schedstat", S_IRUGO, pid_schedstat),
2762 #endif
2763 #ifdef CONFIG_LATENCYTOP
2764         REG("latency",  S_IRUGO, lstats),
2765 #endif
2766 #ifdef CONFIG_PROC_PID_CPUSET
2767         REG("cpuset",    S_IRUGO, cpuset),
2768 #endif
2769 #ifdef CONFIG_CGROUPS
2770         REG("cgroup",  S_IRUGO, cgroup),
2771 #endif
2772         INF("oom_score", S_IRUGO, oom_score),
2773         REG("oom_adj",   S_IRUGO|S_IWUSR, oom_adjust),
2774 #ifdef CONFIG_AUDITSYSCALL
2775         REG("loginuid",  S_IWUSR|S_IRUGO, loginuid),
2776         REG("sessionid",  S_IRUSR, sessionid),
2777 #endif
2778 #ifdef CONFIG_FAULT_INJECTION
2779         REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
2780 #endif
2781 };
2782
2783 static int proc_tid_base_readdir(struct file * filp,
2784                              void * dirent, filldir_t filldir)
2785 {
2786         return proc_pident_readdir(filp,dirent,filldir,
2787                                    tid_base_stuff,ARRAY_SIZE(tid_base_stuff));
2788 }
2789
2790 static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
2791         return proc_pident_lookup(dir, dentry,
2792                                   tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
2793 }
2794
2795 static const struct file_operations proc_tid_base_operations = {
2796         .read           = generic_read_dir,
2797         .readdir        = proc_tid_base_readdir,
2798 };
2799
2800 static const struct inode_operations proc_tid_base_inode_operations = {
2801         .lookup         = proc_tid_base_lookup,
2802         .getattr        = pid_getattr,
2803         .setattr        = proc_setattr,
2804 };
2805
2806 static struct dentry *proc_task_instantiate(struct inode *dir,
2807         struct dentry *dentry, struct task_struct *task, const void *ptr)
2808 {
2809         struct dentry *error = ERR_PTR(-ENOENT);
2810         struct inode *inode;
2811         inode = proc_pid_make_inode(dir->i_sb, task);
2812
2813         if (!inode)
2814                 goto out;
2815         inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
2816         inode->i_op = &proc_tid_base_inode_operations;
2817         inode->i_fop = &proc_tid_base_operations;
2818         inode->i_flags|=S_IMMUTABLE;
2819         inode->i_nlink = 4;
2820 #ifdef CONFIG_SECURITY
2821         inode->i_nlink += 1;
2822 #endif
2823
2824         dentry->d_op = &pid_dentry_operations;
2825
2826         d_add(dentry, inode);
2827         /* Close the race of the process dying before we return the dentry */
2828         if (pid_revalidate(dentry, NULL))
2829                 error = NULL;
2830 out:
2831         return error;
2832 }
2833
2834 static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2835 {
2836         struct dentry *result = ERR_PTR(-ENOENT);
2837         struct task_struct *task;
2838         struct task_struct *leader = get_proc_task(dir);
2839         unsigned tid;
2840         struct pid_namespace *ns;
2841
2842         if (!leader)
2843                 goto out_no_task;
2844
2845         tid = name_to_int(dentry);
2846         if (tid == ~0U)
2847                 goto out;
2848
2849         ns = dentry->d_sb->s_fs_info;
2850         rcu_read_lock();
2851         task = find_task_by_pid_ns(tid, ns);
2852         if (task)
2853                 get_task_struct(task);
2854         rcu_read_unlock();
2855         if (!task)
2856                 goto out;
2857         if (!same_thread_group(leader, task))
2858                 goto out_drop_task;
2859
2860         result = proc_task_instantiate(dir, dentry, task, NULL);
2861 out_drop_task:
2862         put_task_struct(task);
2863 out:
2864         put_task_struct(leader);
2865 out_no_task:
2866         return result;
2867 }
2868
2869 /*
2870  * Find the first tid of a thread group to return to user space.
2871  *
2872  * Usually this is just the thread group leader, but if the users
2873  * buffer was too small or there was a seek into the middle of the
2874  * directory we have more work todo.
2875  *
2876  * In the case of a short read we start with find_task_by_pid.
2877  *
2878  * In the case of a seek we start with the leader and walk nr
2879  * threads past it.
2880  */
2881 static struct task_struct *first_tid(struct task_struct *leader,
2882                 int tid, int nr, struct pid_namespace *ns)
2883 {
2884         struct task_struct *pos;
2885
2886         rcu_read_lock();
2887         /* Attempt to start with the pid of a thread */
2888         if (tid && (nr > 0)) {
2889                 pos = find_task_by_pid_ns(tid, ns);
2890                 if (pos && (pos->group_leader == leader))
2891                         goto found;
2892         }
2893
2894         /* If nr exceeds the number of threads there is nothing todo */
2895         pos = NULL;
2896         if (nr && nr >= get_nr_threads(leader))
2897                 goto out;
2898
2899         /* If we haven't found our starting place yet start
2900          * with the leader and walk nr threads forward.
2901          */
2902         for (pos = leader; nr > 0; --nr) {
2903                 pos = next_thread(pos);
2904                 if (pos == leader) {
2905                         pos = NULL;
2906                         goto out;
2907                 }
2908         }
2909 found:
2910         get_task_struct(pos);
2911 out:
2912         rcu_read_unlock();
2913         return pos;
2914 }
2915
2916 /*
2917  * Find the next thread in the thread list.
2918  * Return NULL if there is an error or no next thread.
2919  *
2920  * The reference to the input task_struct is released.
2921  */
2922 static struct task_struct *next_tid(struct task_struct *start)
2923 {
2924         struct task_struct *pos = NULL;
2925         rcu_read_lock();
2926         if (pid_alive(start)) {
2927                 pos = next_thread(start);
2928                 if (thread_group_leader(pos))
2929                         pos = NULL;
2930                 else
2931                         get_task_struct(pos);
2932         }
2933         rcu_read_unlock();
2934         put_task_struct(start);
2935         return pos;
2936 }
2937
2938 static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
2939         struct task_struct *task, int tid)
2940 {
2941         char name[PROC_NUMBUF];
2942         int len = snprintf(name, sizeof(name), "%d", tid);
2943         return proc_fill_cache(filp, dirent, filldir, name, len,
2944                                 proc_task_instantiate, task, NULL);
2945 }
2946
2947 /* for the /proc/TGID/task/ directories */
2948 static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir)
2949 {
2950         struct dentry *dentry = filp->f_path.dentry;
2951         struct inode *inode = dentry->d_inode;
2952         struct task_struct *leader = NULL;
2953         struct task_struct *task;
2954         int retval = -ENOENT;
2955         ino_t ino;
2956         int tid;
2957         unsigned long pos = filp->f_pos;  /* avoiding "long long" filp->f_pos */
2958         struct pid_namespace *ns;
2959
2960         task = get_proc_task(inode);
2961         if (!task)
2962                 goto out_no_task;
2963         rcu_read_lock();
2964         if (pid_alive(task)) {
2965                 leader = task->group_leader;
2966                 get_task_struct(leader);
2967         }
2968         rcu_read_unlock();
2969         put_task_struct(task);
2970         if (!leader)
2971                 goto out_no_task;
2972         retval = 0;
2973
2974         switch (pos) {
2975         case 0:
2976                 ino = inode->i_ino;
2977                 if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0)
2978                         goto out;
2979                 pos++;
2980                 /* fall through */
2981         case 1:
2982                 ino = parent_ino(dentry);
2983                 if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0)
2984                         goto out;
2985                 pos++;
2986                 /* fall through */
2987         }
2988
2989         /* f_version caches the tgid value that the last readdir call couldn't
2990          * return. lseek aka telldir automagically resets f_version to 0.
2991          */
2992         ns = filp->f_dentry->d_sb->s_fs_info;
2993         tid = (int)filp->f_version;
2994         filp->f_version = 0;
2995         for (task = first_tid(leader, tid, pos - 2, ns);
2996              task;
2997              task = next_tid(task), pos++) {
2998                 tid = task_pid_nr_ns(task, ns);
2999                 if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) {
3000                         /* returning this tgid failed, save it as the first
3001                          * pid for the next readir call */
3002                         filp->f_version = (u64)tid;
3003                         put_task_struct(task);
3004                         break;
3005                 }
3006         }
3007 out:
3008         filp->f_pos = pos;
3009         put_task_struct(leader);
3010 out_no_task:
3011         return retval;
3012 }
3013
3014 static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
3015 {
3016         struct inode *inode = dentry->d_inode;
3017         struct task_struct *p = get_proc_task(inode);
3018         generic_fillattr(inode, stat);
3019
3020         if (p) {
3021                 rcu_read_lock();
3022                 stat->nlink += get_nr_threads(p);
3023                 rcu_read_unlock();
3024                 put_task_struct(p);
3025         }
3026
3027         return 0;
3028 }
3029
3030 static const struct inode_operations proc_task_inode_operations = {
3031         .lookup         = proc_task_lookup,
3032         .getattr        = proc_task_getattr,
3033         .setattr        = proc_setattr,
3034 };
3035
3036 static const struct file_operations proc_task_operations = {
3037         .read           = generic_read_dir,
3038         .readdir        = proc_task_readdir,
3039 };