fs/proc/base.c

   1 /*
   2  *  linux/fs/proc/base.c
   3  *
   4  *  Copyright (C) 1991, 1992 Linus Torvalds
   5  *
   6  *  proc base directory handling functions
   7  *
   8  *  1999, Al Viro. Rewritten. Now it covers the whole per-process part.
   9  *  Instead of using magical inumbers to determine the kind of object
  10  *  we allocate and fill in-core inodes upon lookup. They don't even
  11  *  go into icache. We cache the reference to task_struct upon lookup too.
  12  *  Eventually it should become a filesystem in its own. We don't use the
  13  *  rest of procfs anymore.
  14  *
  15  *
  16  *  Changelog:
  17  *  17-Jan-2005
  18  *  Allan Bezerra
  19  *  Bruna Moreira <bruna.moreira@indt.org.br>
  20  *  Edjard Mota <edjard.mota@indt.org.br>
  21  *  Ilias Biris <ilias.biris@indt.org.br>
  22  *  Mauricio Lin <mauricio.lin@indt.org.br>
  23  *
  24  *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
  25  *
  26  *  A new process specific entry (smaps) included in /proc. It shows the
  27  *  size of rss for each memory area. The maps entry lacks information
  28  *  about physical memory size (rss) for each mapped file, i.e.,
  29  *  rss information for executables and library files.
  30  *  This additional information is useful for any tools that need to know
  31  *  about physical memory consumption for a process specific library.
  32  *
  33  *  Changelog:
  34  *  21-Feb-2005
  35  *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
  36  *  Pud inclusion in the page table walking.
  37  *
  38  *  ChangeLog:
  39  *  10-Mar-2005
  40  *  10LE Instituto Nokia de Tecnologia - INdT:
  41  *  A better way to walk through the page table as suggested by Hugh Dickins.
  42  *
  43  *  Simo Piiroinen <simo.piiroinen@nokia.com>:
  44  *  Smaps information related to shared, private, clean and dirty pages.
  45  *
  46  *  Paul Mundt <paul.mundt@nokia.com>:
  47  *  Overall revision about smaps.
  48  */
  49
  50 #include <asm/uaccess.h>
  51
  52 #include <linux/errno.h>
  53 #include <linux/time.h>
  54 #include <linux/proc_fs.h>
  55 #include <linux/stat.h>
  56 #include <linux/task_io_accounting_ops.h>
  57 #include <linux/init.h>
  58 #include <linux/capability.h>
  59 #include <linux/file.h>
  60 #include <linux/fdtable.h>
  61 #include <linux/string.h>
  62 #include <linux/seq_file.h>
  63 #include <linux/namei.h>
  64 #include <linux/mnt_namespace.h>
  65 #include <linux/mm.h>
  66 #include <linux/rcupdate.h>
  67 #include <linux/kallsyms.h>
  68 #include <linux/stacktrace.h>
  69 #include <linux/resource.h>
  70 #include <linux/module.h>
  71 #include <linux/mount.h>
  72 #include <linux/security.h>
  73 #include <linux/ptrace.h>
  74 #include <linux/tracehook.h>
  75 #include <linux/cgroup.h>
  76 #include <linux/cpuset.h>
  77 #include <linux/audit.h>
  78 #include <linux/poll.h>
  79 #include <linux/nsproxy.h>
  80 #include <linux/oom.h>
  81 #include <linux/elf.h>
  82 #include <linux/pid_namespace.h>
  83 #include <linux/fs_struct.h>
  84 #include "internal.h"
  85
  86 /* NOTE:
  87  *      Implementing inode permission operations in /proc is almost
  88  *      certainly an error.  Permission checks need to happen during
  89  *      each system call not at open time.  The reason is that most of
  90  *      what we wish to check for permissions in /proc varies at runtime.
  91  *
  92  *      The classic example of a problem is opening file descriptors
  93  *      in /proc for a task before it execs a suid executable.
  94  */
  95
  96 struct pid_entry {
  97         char *name;
  98         int len;
  99         mode_t mode;
 100         const struct inode_operations *iop;
 101         const struct file_operations *fop;
 102         union proc_op op;
 103 };
 104
 105 #define NOD(NAME, MODE, IOP, FOP, OP) {                 \
 106         .name = (NAME),                                 \
 107         .len  = sizeof(NAME) - 1,                       \
 108         .mode = MODE,                                   \
 109         .iop  = IOP,                                    \
 110         .fop  = FOP,                                    \
 111         .op   = OP,                                     \
 112 }
 113
 114 #define DIR(NAME, MODE, iops, fops)     \
 115         NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} )
 116 #define LNK(NAME, get_link)                                     \
 117         NOD(NAME, (S_IFLNK|S_IRWXUGO),                          \
 118                 &proc_pid_link_inode_operations, NULL,          \
 119                 { .proc_get_link = get_link } )
 120 #define REG(NAME, MODE, fops)                           \
 121         NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {})
 122 #define INF(NAME, MODE, read)                           \
 123         NOD(NAME, (S_IFREG|(MODE)),                     \
 124                 NULL, &proc_info_file_operations,       \
 125                 { .proc_read = read } )
 126 #define ONE(NAME, MODE, show)                           \
 127         NOD(NAME, (S_IFREG|(MODE)),                     \
 128                 NULL, &proc_single_file_operations,     \
 129                 { .proc_show = show } )
 130
 131 /*
 132  * Count the number of hardlinks for the pid_entry table, excluding the .
 133  * and .. links.
 134  */
 135 static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
 136         unsigned int n)
 137 {
 138         unsigned int i;
 139         unsigned int count;
 140
 141         count = 0;
 142         for (i = 0; i < n; ++i) {
 143                 if (S_ISDIR(entries[i].mode))
 144                         ++count;
 145         }
 146
 147         return count;
 148 }
 149
 150 static int get_fs_path(struct task_struct *task, struct path *path, bool root)
 151 {
 152         struct fs_struct *fs;
 153         int result = -ENOENT;
 154
 155         task_lock(task);
 156         fs = task->fs;
 157         if (fs) {
 158                 read_lock(&fs->lock);
 159                 *path = root ? fs->root : fs->pwd;
 160                 path_get(path);
 161                 read_unlock(&fs->lock);
 162                 result = 0;
 163         }
 164         task_unlock(task);
 165         return result;
 166 }
 167
 168 static int get_nr_threads(struct task_struct *tsk)
 169 {
 170         unsigned long flags;
 171         int count = 0;
 172
 173         if (lock_task_sighand(tsk, &flags)) {
 174                 count = atomic_read(&tsk->signal->count);
 175                 unlock_task_sighand(tsk, &flags);
 176         }
 177         return count;
 178 }
 179
 180 static int proc_cwd_link(struct inode *inode, struct path *path)
 181 {
 182         struct task_struct *task = get_proc_task(inode);
 183         int result = -ENOENT;
 184
 185         if (task) {
 186                 result = get_fs_path(task, path, 0);
 187                 put_task_struct(task);
 188         }
 189         return result;
 190 }
 191
 192 static int proc_root_link(struct inode *inode, struct path *path)
 193 {
 194         struct task_struct *task = get_proc_task(inode);
 195         int result = -ENOENT;
 196
 197         if (task) {
 198                 result = get_fs_path(task, path, 1);
 199                 put_task_struct(task);
 200         }
 201         return result;
 202 }
 203
 204 /*
 205  * Return zero if current may access user memory in @task, -error if not.
 206  */
 207 static int check_mem_permission(struct task_struct *task)
 208 {
 209         /*
 210          * A task can always look at itself, in case it chooses
 211          * to use system calls instead of load instructions.
 212          */
 213         if (task == current)
 214                 return 0;
 215
 216         /*
 217          * If current is actively ptrace'ing, and would also be
 218          * permitted to freshly attach with ptrace now, permit it.
 219          */
 220         if (task_is_stopped_or_traced(task)) {
 221                 int match;
 222                 rcu_read_lock();
 223                 match = (tracehook_tracer_task(task) == current);
 224                 rcu_read_unlock();
 225                 if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
 226                         return 0;
 227         }
 228
 229         /*
 230          * Noone else is allowed.
 231          */
 232         return -EPERM;
 233 }
 234
 235 struct mm_struct *mm_for_maps(struct task_struct *task)
 236 {
 237         struct mm_struct *mm;
 238
 239         if (mutex_lock_killable(&task->cred_guard_mutex))
 240                 return NULL;
 241
 242         mm = get_task_mm(task);
 243         if (mm && mm != current->mm &&
 244                         !ptrace_may_access(task, PTRACE_MODE_READ)) {
 245                 mmput(mm);
 246                 mm = NULL;
 247         }
 248         mutex_unlock(&task->cred_guard_mutex);
 249
 250         return mm;
 251 }
 252
 253 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
 254 {
 255         int res = 0;
 256         unsigned int len;
 257         struct mm_struct *mm = get_task_mm(task);
 258         if (!mm)
 259                 goto out;
 260         if (!mm->arg_end)
 261                 goto out_mm;    /* Shh! No looking before we're done */
 262
 263         len = mm->arg_end - mm->arg_start;
 264
 265         if (len > PAGE_SIZE)
 266                 len = PAGE_SIZE;
 267
 268         res = access_process_vm(task, mm->arg_start, buffer, len, 0);
 269
 270         // If the nul at the end of args has been overwritten, then
 271         // assume application is using setproctitle(3).
 272         if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) {
 273                 len = strnlen(buffer, res);
 274                 if (len < res) {
 275                     res = len;
 276                 } else {
 277                         len = mm->env_end - mm->env_start;
 278                         if (len > PAGE_SIZE - res)
 279                                 len = PAGE_SIZE - res;
 280                         res += access_process_vm(task, mm->env_start, buffer+res, len, 0);
 281                         res = strnlen(buffer, res);
 282                 }
 283         }
 284 out_mm:
 285         mmput(mm);
 286 out:
 287         return res;
 288 }
 289
 290 static int proc_pid_auxv(struct task_struct *task, char *buffer)
 291 {
 292         int res = 0;
 293         struct mm_struct *mm = get_task_mm(task);
 294         if (mm) {
 295                 unsigned int nwords = 0;
 296                 do {
 297                         nwords += 2;
 298                 } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
 299                 res = nwords * sizeof(mm->saved_auxv[0]);
 300                 if (res > PAGE_SIZE)
 301                         res = PAGE_SIZE;
 302                 memcpy(buffer, mm->saved_auxv, res);
 303                 mmput(mm);
 304         }
 305         return res;
 306 }
 307
 308
 309 #ifdef CONFIG_KALLSYMS
 310 /*
 311  * Provides a wchan file via kallsyms in a proper one-value-per-file format.
 312  * Returns the resolved symbol.  If that fails, simply return the address.
 313  */
 314 static int proc_pid_wchan(struct task_struct *task, char *buffer)
 315 {
 316         unsigned long wchan;
 317         char symname[KSYM_NAME_LEN];
 318
 319         wchan = get_wchan(task);
 320
 321         if (lookup_symbol_name(wchan, symname) < 0)
 322                 if (!ptrace_may_access(task, PTRACE_MODE_READ))
 323                         return 0;
 324                 else
 325                         return sprintf(buffer, "%lu", wchan);
 326         else
 327                 return sprintf(buffer, "%s", symname);
 328 }
 329 #endif /* CONFIG_KALLSYMS */
 330
 331 static int lock_trace(struct task_struct *task)
 332 {
 333         int err = mutex_lock_killable(&task->cred_guard_mutex);
 334         if (err)
 335                 return err;
 336         if (!ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
 337                 mutex_unlock(&task->cred_guard_mutex);
 338                 return -EPERM;
 339         }
 340         return 0;
 341 }
 342
 343 static void unlock_trace(struct task_struct *task)
 344 {
 345         mutex_unlock(&task->cred_guard_mutex);
 346 }
 347
 348 #ifdef CONFIG_STACKTRACE
 349
 350 #define MAX_STACK_TRACE_DEPTH   64
 351
 352 static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
 353                           struct pid *pid, struct task_struct *task)
 354 {
 355         struct stack_trace trace;
 356         unsigned long *entries;
 357         int err;
 358         int i;
 359
 360         entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL);
 361         if (!entries)
 362                 return -ENOMEM;
 363
 364         trace.nr_entries        = 0;
 365         trace.max_entries       = MAX_STACK_TRACE_DEPTH;
 366         trace.entries           = entries;
 367         trace.skip              = 0;
 368
 369         err = lock_trace(task);
 370         if (!err) {
 371                 save_stack_trace_tsk(task, &trace);
 372
 373                 for (i = 0; i < trace.nr_entries; i++) {
 374                         seq_printf(m, "[<%p>] %pS\n",
 375                                    (void *)entries[i], (void *)entries[i]);
 376                 }
 377                 unlock_trace(task);
 378         }
 379         kfree(entries);
 380
 381         return err;
 382 }
 383 #endif
 384
 385 #ifdef CONFIG_SCHEDSTATS
 386 /*
 387  * Provides /proc/PID/schedstat
 388  */
 389 static int proc_pid_schedstat(struct task_struct *task, char *buffer)
 390 {
 391         return sprintf(buffer, "%llu %llu %lu\n",
 392                         (unsigned long long)task->se.sum_exec_runtime,
 393                         (unsigned long long)task->sched_info.run_delay,
 394                         task->sched_info.pcount);
 395 }
 396 #endif
 397
 398 #ifdef CONFIG_LATENCYTOP
 399 static int lstats_show_proc(struct seq_file *m, void *v)
 400 {
 401         int i;
 402         struct inode *inode = m->private;
 403         struct task_struct *task = get_proc_task(inode);
 404
 405         if (!task)
 406                 return -ESRCH;
 407         seq_puts(m, "Latency Top version : v0.1\n");
 408         for (i = 0; i < 32; i++) {
 409                 if (task->latency_record[i].backtrace[0]) {
 410                         int q;
 411                         seq_printf(m, "%i %li %li ",
 412                                 task->latency_record[i].count,
 413                                 task->latency_record[i].time,
 414                                 task->latency_record[i].max);
 415                         for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
 416                                 char sym[KSYM_SYMBOL_LEN];
 417                                 char *c;
 418                                 if (!task->latency_record[i].backtrace[q])
 419                                         break;
 420                                 if (task->latency_record[i].backtrace[q] == ULONG_MAX)
 421                                         break;
 422                                 sprint_symbol(sym, task->latency_record[i].backtrace[q]);
 423                                 c = strchr(sym, '+');
 424                                 if (c)
 425                                         *c = 0;
 426                                 seq_printf(m, "%s ", sym);
 427                         }
 428                         seq_printf(m, "\n");
 429                 }
 430
 431         }
 432         put_task_struct(task);
 433         return 0;
 434 }
 435
 436 static int lstats_open(struct inode *inode, struct file *file)
 437 {
 438         return single_open(file, lstats_show_proc, inode);
 439 }
 440
 441 static ssize_t lstats_write(struct file *file, const char __user *buf,
 442                             size_t count, loff_t *offs)
 443 {
 444         struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
 445
 446         if (!task)
 447                 return -ESRCH;
 448         clear_all_latency_tracing(task);
 449         put_task_struct(task);
 450
 451         return count;
 452 }
 453
 454 static const struct file_operations proc_lstats_operations = {
 455         .open           = lstats_open,
 456         .read           = seq_read,
 457         .write          = lstats_write,
 458         .llseek         = seq_lseek,
 459         .release        = single_release,
 460 };
 461
 462 #endif
 463
 464 /* The badness from the OOM killer */
 465 unsigned long badness(struct task_struct *p, unsigned long uptime);
 466 static int proc_oom_score(struct task_struct *task, char *buffer)
 467 {
 468         unsigned long points = 0;
 469         struct timespec uptime;
 470
 471         do_posix_clock_monotonic_gettime(&uptime);
 472         read_lock(&tasklist_lock);
 473         if (pid_alive(task))
 474                 points = badness(task, uptime.tv_sec);
 475         read_unlock(&tasklist_lock);
 476         return sprintf(buffer, "%lu\n", points);
 477 }
 478
 479 struct limit_names {
 480         char *name;
 481         char *unit;
 482 };
 483
 484 static const struct limit_names lnames[RLIM_NLIMITS] = {
 485         [RLIMIT_CPU] = {"Max cpu time", "seconds"},
 486         [RLIMIT_FSIZE] = {"Max file size", "bytes"},
 487         [RLIMIT_DATA] = {"Max data size", "bytes"},
 488         [RLIMIT_STACK] = {"Max stack size", "bytes"},
 489         [RLIMIT_CORE] = {"Max core file size", "bytes"},
 490         [RLIMIT_RSS] = {"Max resident set", "bytes"},
 491         [RLIMIT_NPROC] = {"Max processes", "processes"},
 492         [RLIMIT_NOFILE] = {"Max open files", "files"},
 493         [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"},
 494         [RLIMIT_AS] = {"Max address space", "bytes"},
 495         [RLIMIT_LOCKS] = {"Max file locks", "locks"},
 496         [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"},
 497         [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"},
 498         [RLIMIT_NICE] = {"Max nice priority", NULL},
 499         [RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
 500         [RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
 501 };
 502
 503 /* Display limits for a process */
 504 static int proc_pid_limits(struct task_struct *task, char *buffer)
 505 {
 506         unsigned int i;
 507         int count = 0;
 508         unsigned long flags;
 509         char *bufptr = buffer;
 510
 511         struct rlimit rlim[RLIM_NLIMITS];
 512
 513         if (!lock_task_sighand(task, &flags))
 514                 return 0;
 515         memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
 516         unlock_task_sighand(task, &flags);
 517
 518         /*
 519          * print the file header
 520          */
 521         count += sprintf(&bufptr[count], "%-25s %-20s %-20s %-10s\n",
 522                         "Limit", "Soft Limit", "Hard Limit", "Units");
 523
 524         for (i = 0; i < RLIM_NLIMITS; i++) {
 525                 if (rlim[i].rlim_cur == RLIM_INFINITY)
 526                         count += sprintf(&bufptr[count], "%-25s %-20s ",
 527                                          lnames[i].name, "unlimited");
 528                 else
 529                         count += sprintf(&bufptr[count], "%-25s %-20lu ",
 530                                          lnames[i].name, rlim[i].rlim_cur);
 531
 532                 if (rlim[i].rlim_max == RLIM_INFINITY)
 533                         count += sprintf(&bufptr[count], "%-20s ", "unlimited");
 534                 else
 535                         count += sprintf(&bufptr[count], "%-20lu ",
 536                                          rlim[i].rlim_max);
 537
 538                 if (lnames[i].unit)
 539                         count += sprintf(&bufptr[count], "%-10s\n",
 540                                          lnames[i].unit);
 541                 else
 542                         count += sprintf(&bufptr[count], "\n");
 543         }
 544
 545         return count;
 546 }
 547
 548 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
 549 static int proc_pid_syscall(struct task_struct *task, char *buffer)
 550 {
 551         long nr;
 552         unsigned long args[6], sp, pc;
 553         int res = lock_trace(task);
 554         if (res)
 555                 return res;
 556
 557         if (task_current_syscall(task, &nr, args, 6, &sp, &pc))
 558                 res = sprintf(buffer, "running\n");
 559         else if (nr < 0)
 560                 res = sprintf(buffer, "%ld 0x%lx 0x%lx\n", nr, sp, pc);
 561         else
 562                 res = sprintf(buffer,
 563                        "%ld 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx 0x%lx\n",
 564                        nr,
 565                        args[0], args[1], args[2], args[3], args[4], args[5],
 566                        sp, pc);
 567         unlock_trace(task);
 568         return res;
 569 }
 570 #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */
 571
 572 /************************************************************************/
 573 /*                       Here the fs part begins                        */
 574 /************************************************************************/
 575
 576 /* permission checks */
 577 static int proc_fd_access_allowed(struct inode *inode)
 578 {
 579         struct task_struct *task;
 580         int allowed = 0;
 581         /* Allow access to a task's file descriptors if it is us or we
 582          * may use ptrace attach to the process and find out that
 583          * information.
 584          */
 585         task = get_proc_task(inode);
 586         if (task) {
 587                 allowed = ptrace_may_access(task, PTRACE_MODE_READ);
 588                 put_task_struct(task);
 589         }
 590         return allowed;
 591 }
 592
 593 static int proc_setattr(struct dentry *dentry, struct iattr *attr)
 594 {
 595         int error;
 596         struct inode *inode = dentry->d_inode;
 597
 598         if (attr->ia_valid & ATTR_MODE)
 599                 return -EPERM;
 600
 601         error = inode_change_ok(inode, attr);
 602         if (!error)
 603                 error = inode_setattr(inode, attr);
 604         return error;
 605 }
 606
 607 static const struct inode_operations proc_def_inode_operations = {
 608         .setattr        = proc_setattr,
 609 };
 610
 611 static int mounts_open_common(struct inode *inode, struct file *file,
 612                               const struct seq_operations *op)
 613 {
 614         struct task_struct *task = get_proc_task(inode);
 615         struct nsproxy *nsp;
 616         struct mnt_namespace *ns = NULL;
 617         struct path root;
 618         struct proc_mounts *p;
 619         int ret = -EINVAL;
 620
 621         if (task) {
 622                 rcu_read_lock();
 623                 nsp = task_nsproxy(task);
 624                 if (nsp) {
 625                         ns = nsp->mnt_ns;
 626                         if (ns)
 627                                 get_mnt_ns(ns);
 628                 }
 629                 rcu_read_unlock();
 630                 if (ns && get_fs_path(task, &root, 1) == 0)
 631                         ret = 0;
 632                 put_task_struct(task);
 633         }
 634
 635         if (!ns)
 636                 goto err;
 637         if (ret)
 638                 goto err_put_ns;
 639
 640         ret = -ENOMEM;
 641         p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
 642         if (!p)
 643                 goto err_put_path;
 644
 645         file->private_data = &p->m;
 646         ret = seq_open(file, op);
 647         if (ret)
 648                 goto err_free;
 649
 650         p->m.private = p;
 651         p->ns = ns;
 652         p->root = root;
 653         p->event = ns->event;
 654
 655         return 0;
 656
 657  err_free:
 658         kfree(p);
 659  err_put_path:
 660         path_put(&root);
 661  err_put_ns:
 662         put_mnt_ns(ns);
 663  err:
 664         return ret;
 665 }
 666
 667 static int mounts_release(struct inode *inode, struct file *file)
 668 {
 669         struct proc_mounts *p = file->private_data;
 670         path_put(&p->root);
 671         put_mnt_ns(p->ns);
 672         return seq_release(inode, file);
 673 }
 674
 675 static unsigned mounts_poll(struct file *file, poll_table *wait)
 676 {
 677         struct proc_mounts *p = file->private_data;
 678         struct mnt_namespace *ns = p->ns;
 679         unsigned res = POLLIN | POLLRDNORM;
 680
 681         poll_wait(file, &ns->poll, wait);
 682
 683         spin_lock(&vfsmount_lock);
 684         if (p->event != ns->event) {
 685                 p->event = ns->event;
 686                 res |= POLLERR | POLLPRI;
 687         }
 688         spin_unlock(&vfsmount_lock);
 689
 690         return res;
 691 }
 692
 693 static int mounts_open(struct inode *inode, struct file *file)
 694 {
 695         return mounts_open_common(inode, file, &mounts_op);
 696 }
 697
 698 static const struct file_operations proc_mounts_operations = {
 699         .open           = mounts_open,
 700         .read           = seq_read,
 701         .llseek         = seq_lseek,
 702         .release        = mounts_release,
 703         .poll           = mounts_poll,
 704 };
 705
 706 static int mountinfo_open(struct inode *inode, struct file *file)
 707 {
 708         return mounts_open_common(inode, file, &mountinfo_op);
 709 }
 710
 711 static const struct file_operations proc_mountinfo_operations = {
 712         .open           = mountinfo_open,
 713         .read           = seq_read,
 714         .llseek         = seq_lseek,
 715         .release        = mounts_release,
 716         .poll           = mounts_poll,
 717 };
 718
 719 static int mountstats_open(struct inode *inode, struct file *file)
 720 {
 721         return mounts_open_common(inode, file, &mountstats_op);
 722 }
 723
 724 static const struct file_operations proc_mountstats_operations = {
 725         .open           = mountstats_open,
 726         .read           = seq_read,
 727         .llseek         = seq_lseek,
 728         .release        = mounts_release,
 729 };
 730
 731 #define PROC_BLOCK_SIZE (3*1024)                /* 4K page size but our output routines use some slack for overruns */
 732
 733 static ssize_t proc_info_read(struct file * file, char __user * buf,
 734                           size_t count, loff_t *ppos)
 735 {
 736         struct inode * inode = file->f_path.dentry->d_inode;
 737         unsigned long page;
 738         ssize_t length;
 739         struct task_struct *task = get_proc_task(inode);
 740
 741         length = -ESRCH;
 742         if (!task)
 743                 goto out_no_task;
 744
 745         if (count > PROC_BLOCK_SIZE)
 746                 count = PROC_BLOCK_SIZE;
 747
 748         length = -ENOMEM;
 749         if (!(page = __get_free_page(GFP_TEMPORARY)))
 750                 goto out;
 751
 752         length = PROC_I(inode)->op.proc_read(task, (char*)page);
 753
 754         if (length >= 0)
 755                 length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
 756         free_page(page);
 757 out:
 758         put_task_struct(task);
 759 out_no_task:
 760         return length;
 761 }
 762
 763 static const struct file_operations proc_info_file_operations = {
 764         .read           = proc_info_read,
 765 };
 766
 767 static int proc_single_show(struct seq_file *m, void *v)
 768 {
 769         struct inode *inode = m->private;
 770         struct pid_namespace *ns;
 771         struct pid *pid;
 772         struct task_struct *task;
 773         int ret;
 774
 775         ns = inode->i_sb->s_fs_info;
 776         pid = proc_pid(inode);
 777         task = get_pid_task(pid, PIDTYPE_PID);
 778         if (!task)
 779                 return -ESRCH;
 780
 781         ret = PROC_I(inode)->op.proc_show(m, ns, pid, task);
 782
 783         put_task_struct(task);
 784         return ret;
 785 }
 786
 787 static int proc_single_open(struct inode *inode, struct file *filp)
 788 {
 789         int ret;
 790         ret = single_open(filp, proc_single_show, NULL);
 791         if (!ret) {
 792                 struct seq_file *m = filp->private_data;
 793
 794                 m->private = inode;
 795         }
 796         return ret;
 797 }
 798
 799 static const struct file_operations proc_single_file_operations = {
 800         .open           = proc_single_open,
 801         .read           = seq_read,
 802         .llseek         = seq_lseek,
 803         .release        = single_release,
 804 };
 805
 806 static int mem_open(struct inode* inode, struct file* file)
 807 {
 808         file->private_data = (void*)((long)current->self_exec_id);
 809         return 0;
 810 }
 811
 812 static ssize_t mem_read(struct file * file, char __user * buf,
 813                         size_t count, loff_t *ppos)
 814 {
 815         struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
 816         char *page;
 817         unsigned long src = *ppos;
 818         int ret = -ESRCH;
 819         struct mm_struct *mm;
 820
 821         if (!task)
 822                 goto out_no_task;
 823
 824         if (check_mem_permission(task))
 825                 goto out;
 826
 827         ret = -ENOMEM;
 828         page = (char *)__get_free_page(GFP_TEMPORARY);
 829         if (!page)
 830                 goto out;
 831
 832         ret = 0;
 833
 834         mm = get_task_mm(task);
 835         if (!mm)
 836                 goto out_free;
 837
 838         ret = -EIO;
 839
 840         if (file->private_data != (void*)((long)current->self_exec_id))
 841                 goto out_put;
 842
 843         ret = 0;
 844
 845         while (count > 0) {
 846                 int this_len, retval;
 847
 848                 this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
 849                 retval = access_process_vm(task, src, page, this_len, 0);
 850                 if (!retval || check_mem_permission(task)) {
 851                         if (!ret)
 852                                 ret = -EIO;
 853                         break;
 854                 }
 855
 856                 if (copy_to_user(buf, page, retval)) {
 857                         ret = -EFAULT;
 858                         break;
 859                 }
 860
 861                 ret += retval;
 862                 src += retval;
 863                 buf += retval;
 864                 count -= retval;
 865         }
 866         *ppos = src;
 867
 868 out_put:
 869         mmput(mm);
 870 out_free:
 871         free_page((unsigned long) page);
 872 out:
 873         put_task_struct(task);
 874 out_no_task:
 875         return ret;
 876 }
 877
 878 #define mem_write NULL
 879
 880 #ifndef mem_write
 881 /* This is a security hazard */
 882 static ssize_t mem_write(struct file * file, const char __user *buf,
 883                          size_t count, loff_t *ppos)
 884 {
 885         int copied;
 886         char *page;
 887         struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
 888         unsigned long dst = *ppos;
 889
 890         copied = -ESRCH;
 891         if (!task)
 892                 goto out_no_task;
 893
 894         if (check_mem_permission(task))
 895                 goto out;
 896
 897         copied = -ENOMEM;
 898         page = (char *)__get_free_page(GFP_TEMPORARY);
 899         if (!page)
 900                 goto out;
 901
 902         copied = 0;
 903         while (count > 0) {
 904                 int this_len, retval;
 905
 906                 this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
 907                 if (copy_from_user(page, buf, this_len)) {
 908                         copied = -EFAULT;
 909                         break;
 910                 }
 911                 retval = access_process_vm(task, dst, page, this_len, 1);
 912                 if (!retval) {
 913                         if (!copied)
 914                                 copied = -EIO;
 915                         break;
 916                 }
 917                 copied += retval;
 918                 buf += retval;
 919                 dst += retval;
 920                 count -= retval;
 921         }
 922         *ppos = dst;
 923         free_page((unsigned long) page);
 924 out:
 925         put_task_struct(task);
 926 out_no_task:
 927         return copied;
 928 }
 929 #endif
 930
 931 loff_t mem_lseek(struct file *file, loff_t offset, int orig)
 932 {
 933         switch (orig) {
 934         case 0:
 935                 file->f_pos = offset;
 936                 break;
 937         case 1:
 938                 file->f_pos += offset;
 939                 break;
 940         default:
 941                 return -EINVAL;
 942         }
 943         force_successful_syscall_return();
 944         return file->f_pos;
 945 }
 946
 947 static const struct file_operations proc_mem_operations = {
 948         .llseek         = mem_lseek,
 949         .read           = mem_read,
 950         .write          = mem_write,
 951         .open           = mem_open,
 952 };
 953
 954 static ssize_t environ_read(struct file *file, char __user *buf,
 955                         size_t count, loff_t *ppos)
 956 {
 957         struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
 958         char *page;
 959         unsigned long src = *ppos;
 960         int ret = -ESRCH;
 961         struct mm_struct *mm;
 962
 963         if (!task)
 964                 goto out_no_task;
 965
 966         if (!ptrace_may_access(task, PTRACE_MODE_READ))
 967                 goto out;
 968
 969         ret = -ENOMEM;
 970         page = (char *)__get_free_page(GFP_TEMPORARY);
 971         if (!page)
 972                 goto out;
 973
 974         ret = 0;
 975
 976         mm = get_task_mm(task);
 977         if (!mm)
 978                 goto out_free;
 979
 980         while (count > 0) {
 981                 int this_len, retval, max_len;
 982
 983                 this_len = mm->env_end - (mm->env_start + src);
 984
 985                 if (this_len <= 0)
 986                         break;
 987
 988                 max_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
 989                 this_len = (this_len > max_len) ? max_len : this_len;
 990
 991                 retval = access_process_vm(task, (mm->env_start + src),
 992                         page, this_len, 0);
 993
 994                 if (retval <= 0) {
 995                         ret = retval;
 996                         break;
 997                 }
 998
 999                 if (copy_to_user(buf, page, retval)) {
1000                         ret = -EFAULT;
1001                         break;
1002                 }
1003
1004                 ret += retval;
1005                 src += retval;
1006                 buf += retval;
1007                 count -= retval;
1008         }
1009         *ppos = src;
1010
1011         mmput(mm);
1012 out_free:
1013         free_page((unsigned long) page);
1014 out:
1015         put_task_struct(task);
1016 out_no_task:
1017         return ret;
1018 }
1019
1020 static const struct file_operations proc_environ_operations = {
1021         .read           = environ_read,
1022 };
1023
1024 static ssize_t oom_adjust_read(struct file *file, char __user *buf,
1025                                 size_t count, loff_t *ppos)
1026 {
1027         struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
1028         char buffer[PROC_NUMBUF];
1029         size_t len;
1030         int oom_adjust = OOM_DISABLE;
1031         unsigned long flags;
1032
1033         if (!task)
1034                 return -ESRCH;
1035
1036         if (lock_task_sighand(task, &flags)) {
1037                 oom_adjust = task->signal->oom_adj;
1038                 unlock_task_sighand(task, &flags);
1039         }
1040
1041         put_task_struct(task);
1042
1043         len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
1044
1045         return simple_read_from_buffer(buf, count, ppos, buffer, len);
1046 }
1047
1048 static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1049                                 size_t count, loff_t *ppos)
1050 {
1051         struct task_struct *task;
1052         char buffer[PROC_NUMBUF];
1053         long oom_adjust;
1054         unsigned long flags;
1055         int err;
1056
1057         memset(buffer, 0, sizeof(buffer));
1058         if (count > sizeof(buffer) - 1)
1059                 count = sizeof(buffer) - 1;
1060         if (copy_from_user(buffer, buf, count))
1061                 return -EFAULT;
1062
1063         err = strict_strtol(strstrip(buffer), 0, &oom_adjust);
1064         if (err)
1065                 return -EINVAL;
1066         if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
1067              oom_adjust != OOM_DISABLE)
1068                 return -EINVAL;
1069
1070         task = get_proc_task(file->f_path.dentry->d_inode);
1071         if (!task)
1072                 return -ESRCH;
1073         if (!lock_task_sighand(task, &flags)) {
1074                 put_task_struct(task);
1075                 return -ESRCH;
1076         }
1077
1078         if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
1079                 unlock_task_sighand(task, &flags);
1080                 put_task_struct(task);
1081                 return -EACCES;
1082         }
1083
1084         task->signal->oom_adj = oom_adjust;
1085
1086         unlock_task_sighand(task, &flags);
1087         put_task_struct(task);
1088
1089         return count;
1090 }
1091
1092 static const struct file_operations proc_oom_adjust_operations = {
1093         .read           = oom_adjust_read,
1094         .write          = oom_adjust_write,
1095 };
1096
1097 #ifdef CONFIG_AUDITSYSCALL
1098 #define TMPBUFLEN 21
1099 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
1100                                   size_t count, loff_t *ppos)
1101 {
1102         struct inode * inode = file->f_path.dentry->d_inode;
1103         struct task_struct *task = get_proc_task(inode);
1104         ssize_t length;
1105         char tmpbuf[TMPBUFLEN];
1106
1107         if (!task)
1108                 return -ESRCH;
1109         length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1110                                 audit_get_loginuid(task));
1111         put_task_struct(task);
1112         return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1113 }
1114
1115 static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1116                                    size_t count, loff_t *ppos)
1117 {
1118         struct inode * inode = file->f_path.dentry->d_inode;
1119         char *page, *tmp;
1120         ssize_t length;
1121         uid_t loginuid;
1122
1123         if (!capable(CAP_AUDIT_CONTROL))
1124                 return -EPERM;
1125
1126         if (current != pid_task(proc_pid(inode), PIDTYPE_PID))
1127                 return -EPERM;
1128
1129         if (count >= PAGE_SIZE)
1130                 count = PAGE_SIZE - 1;
1131
1132         if (*ppos != 0) {
1133                 /* No partial writes. */
1134                 return -EINVAL;
1135         }
1136         page = (char*)__get_free_page(GFP_TEMPORARY);
1137         if (!page)
1138                 return -ENOMEM;
1139         length = -EFAULT;
1140         if (copy_from_user(page, buf, count))
1141                 goto out_free_page;
1142
1143         page[count] = '\0';
1144         loginuid = simple_strtoul(page, &tmp, 10);
1145         if (tmp == page) {
1146                 length = -EINVAL;
1147                 goto out_free_page;
1148
1149         }
1150         length = audit_set_loginuid(current, loginuid);
1151         if (likely(length == 0))
1152                 length = count;
1153
1154 out_free_page:
1155         free_page((unsigned long) page);
1156         return length;
1157 }
1158
1159 static const struct file_operations proc_loginuid_operations = {
1160         .read           = proc_loginuid_read,
1161         .write          = proc_loginuid_write,
1162 };
1163
1164 static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
1165                                   size_t count, loff_t *ppos)
1166 {
1167         struct inode * inode = file->f_path.dentry->d_inode;
1168         struct task_struct *task = get_proc_task(inode);
1169         ssize_t length;
1170         char tmpbuf[TMPBUFLEN];
1171
1172         if (!task)
1173                 return -ESRCH;
1174         length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1175                                 audit_get_sessionid(task));
1176         put_task_struct(task);
1177         return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1178 }
1179
1180 static const struct file_operations proc_sessionid_operations = {
1181         .read           = proc_sessionid_read,
1182 };
1183 #endif
1184
1185 #ifdef CONFIG_FAULT_INJECTION
1186 static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
1187                                       size_t count, loff_t *ppos)
1188 {
1189         struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
1190         char buffer[PROC_NUMBUF];
1191         size_t len;
1192         int make_it_fail;
1193
1194         if (!task)
1195                 return -ESRCH;
1196         make_it_fail = task->make_it_fail;
1197         put_task_struct(task);
1198
1199         len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail);
1200
1201         return simple_read_from_buffer(buf, count, ppos, buffer, len);
1202 }
1203
1204 static ssize_t proc_fault_inject_write(struct file * file,
1205                         const char __user * buf, size_t count, loff_t *ppos)
1206 {
1207         struct task_struct *task;
1208         char buffer[PROC_NUMBUF], *end;
1209         int make_it_fail;
1210
1211         if (!capable(CAP_SYS_RESOURCE))
1212                 return -EPERM;
1213         memset(buffer, 0, sizeof(buffer));
1214         if (count > sizeof(buffer) - 1)
1215                 count = sizeof(buffer) - 1;
1216         if (copy_from_user(buffer, buf, count))
1217                 return -EFAULT;
1218         make_it_fail = simple_strtol(strstrip(buffer), &end, 0);
1219         if (*end)
1220                 return -EINVAL;
1221         task = get_proc_task(file->f_dentry->d_inode);
1222         if (!task)
1223                 return -ESRCH;
1224         task->make_it_fail = make_it_fail;
1225         put_task_struct(task);
1226
1227         return count;
1228 }
1229
1230 static const struct file_operations proc_fault_inject_operations = {
1231         .read           = proc_fault_inject_read,
1232         .write          = proc_fault_inject_write,
1233 };
1234 #endif
1235
1236
1237 #ifdef CONFIG_SCHED_DEBUG
1238 /*
1239  * Print out various scheduling related per-task fields:
1240  */
1241 static int sched_show(struct seq_file *m, void *v)
1242 {
1243         struct inode *inode = m->private;
1244         struct task_struct *p;
1245
1246         p = get_proc_task(inode);
1247         if (!p)
1248                 return -ESRCH;
1249         proc_sched_show_task(p, m);
1250
1251         put_task_struct(p);
1252
1253         return 0;
1254 }
1255
1256 static ssize_t
1257 sched_write(struct file *file, const char __user *buf,
1258             size_t count, loff_t *offset)
1259 {
1260         struct inode *inode = file->f_path.dentry->d_inode;
1261         struct task_struct *p;
1262
1263         p = get_proc_task(inode);
1264         if (!p)
1265                 return -ESRCH;
1266         proc_sched_set_task(p);
1267
1268         put_task_struct(p);
1269
1270         return count;
1271 }
1272
1273 static int sched_open(struct inode *inode, struct file *filp)
1274 {
1275         int ret;
1276
1277         ret = single_open(filp, sched_show, NULL);
1278         if (!ret) {
1279                 struct seq_file *m = filp->private_data;
1280
1281                 m->private = inode;
1282         }
1283         return ret;
1284 }
1285
1286 static const struct file_operations proc_pid_sched_operations = {
1287         .open           = sched_open,
1288         .read           = seq_read,
1289         .write          = sched_write,
1290         .llseek         = seq_lseek,
1291         .release        = single_release,
1292 };
1293
1294 #endif
1295
1296 /*
1297  * We added or removed a vma mapping the executable. The vmas are only mapped
1298  * during exec and are not mapped with the mmap system call.
1299  * Callers must hold down_write() on the mm's mmap_sem for these
1300  */
1301 void added_exe_file_vma(struct mm_struct *mm)
1302 {
1303         mm->num_exe_file_vmas++;
1304 }
1305
1306 void removed_exe_file_vma(struct mm_struct *mm)
1307 {
1308         mm->num_exe_file_vmas--;
1309         if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
1310                 fput(mm->exe_file);
1311                 mm->exe_file = NULL;
1312         }
1313
1314 }
1315
1316 void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
1317 {
1318         if (new_exe_file)
1319                 get_file(new_exe_file);
1320         if (mm->exe_file)
1321                 fput(mm->exe_file);
1322         mm->exe_file = new_exe_file;
1323         mm->num_exe_file_vmas = 0;
1324 }
1325
1326 struct file *get_mm_exe_file(struct mm_struct *mm)
1327 {
1328         struct file *exe_file;
1329
1330         /* We need mmap_sem to protect against races with removal of
1331          * VM_EXECUTABLE vmas */
1332         down_read(&mm->mmap_sem);
1333         exe_file = mm->exe_file;
1334         if (exe_file)
1335                 get_file(exe_file);
1336         up_read(&mm->mmap_sem);
1337         return exe_file;
1338 }
1339
1340 void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
1341 {
1342         /* It's safe to write the exe_file pointer without exe_file_lock because
1343          * this is called during fork when the task is not yet in /proc */
1344         newmm->exe_file = get_mm_exe_file(oldmm);
1345 }
1346
1347 static int proc_exe_link(struct inode *inode, struct path *exe_path)
1348 {
1349         struct task_struct *task;
1350         struct mm_struct *mm;
1351         struct file *exe_file;
1352
1353         task = get_proc_task(inode);
1354         if (!task)
1355                 return -ENOENT;
1356         mm = get_task_mm(task);
1357         put_task_struct(task);
1358         if (!mm)
1359                 return -ENOENT;
1360         exe_file = get_mm_exe_file(mm);
1361         mmput(mm);
1362         if (exe_file) {
1363                 *exe_path = exe_file->f_path;
1364                 path_get(&exe_file->f_path);
1365                 fput(exe_file);
1366                 return 0;
1367         } else
1368                 return -ENOENT;
1369 }
1370
1371 static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
1372 {
1373         struct inode *inode = dentry->d_inode;
1374         int error = -EACCES;
1375
1376         /* We don't need a base pointer in the /proc filesystem */
1377         path_put(&nd->path);
1378
1379         /* Are we allowed to snoop on the tasks file descriptors? */
1380         if (!proc_fd_access_allowed(inode))
1381                 goto out;
1382
1383         error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
1384         nd->last_type = LAST_BIND;
1385 out:
1386         return ERR_PTR(error);
1387 }
1388
1389 static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
1390 {
1391         char *tmp = (char*)__get_free_page(GFP_TEMPORARY);
1392         char *pathname;
1393         int len;
1394
1395         if (!tmp)
1396                 return -ENOMEM;
1397
1398         pathname = d_path(path, tmp, PAGE_SIZE);
1399         len = PTR_ERR(pathname);
1400         if (IS_ERR(pathname))
1401                 goto out;
1402         len = tmp + PAGE_SIZE - 1 - pathname;
1403
1404         if (len > buflen)
1405                 len = buflen;
1406         if (copy_to_user(buffer, pathname, len))
1407                 len = -EFAULT;
1408  out:
1409         free_page((unsigned long)tmp);
1410         return len;
1411 }
1412
1413 static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
1414 {
1415         int error = -EACCES;
1416         struct inode *inode = dentry->d_inode;
1417         struct path path;
1418
1419         /* Are we allowed to snoop on the tasks file descriptors? */
1420         if (!proc_fd_access_allowed(inode))
1421                 goto out;
1422
1423         error = PROC_I(inode)->op.proc_get_link(inode, &path);
1424         if (error)
1425                 goto out;
1426
1427         error = do_proc_readlink(&path, buffer, buflen);
1428         path_put(&path);
1429 out:
1430         return error;
1431 }
1432
1433 static const struct inode_operations proc_pid_link_inode_operations = {
1434         .readlink       = proc_pid_readlink,
1435         .follow_link    = proc_pid_follow_link,
1436         .setattr        = proc_setattr,
1437 };
1438
1439
1440 /* building an inode */
1441
1442 static int task_dumpable(struct task_struct *task)
1443 {
1444         int dumpable = 0;
1445         struct mm_struct *mm;
1446
1447         task_lock(task);
1448         mm = task->mm;
1449         if (mm)
1450                 dumpable = get_dumpable(mm);
1451         task_unlock(task);
1452         if(dumpable == 1)
1453                 return 1;
1454         return 0;
1455 }
1456
1457
1458 static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
1459 {
1460         struct inode * inode;
1461         struct proc_inode *ei;
1462         const struct cred *cred;
1463
1464         /* We need a new inode */
1465
1466         inode = new_inode(sb);
1467         if (!inode)
1468                 goto out;
1469
1470         /* Common stuff */
1471         ei = PROC_I(inode);
1472         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1473         inode->i_op = &proc_def_inode_operations;
1474
1475         /*
1476          * grab the reference to task.
1477          */
1478         ei->pid = get_task_pid(task, PIDTYPE_PID);
1479         if (!ei->pid)
1480                 goto out_unlock;
1481
1482         if (task_dumpable(task)) {
1483                 rcu_read_lock();
1484                 cred = __task_cred(task);
1485                 inode->i_uid = cred->euid;
1486                 inode->i_gid = cred->egid;
1487                 rcu_read_unlock();
1488         }
1489         security_task_to_inode(task, inode);
1490
1491 out:
1492         return inode;
1493
1494 out_unlock:
1495         iput(inode);
1496         return NULL;
1497 }
1498
1499 static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1500 {
1501         struct inode *inode = dentry->d_inode;
1502         struct task_struct *task;
1503         const struct cred *cred;
1504
1505         generic_fillattr(inode, stat);
1506
1507         rcu_read_lock();
1508         stat->uid = 0;
1509         stat->gid = 0;
1510         task = pid_task(proc_pid(inode), PIDTYPE_PID);
1511         if (task) {
1512                 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1513                     task_dumpable(task)) {
1514                         cred = __task_cred(task);
1515                         stat->uid = cred->euid;
1516                         stat->gid = cred->egid;
1517                 }
1518         }
1519         rcu_read_unlock();
1520         return 0;
1521 }
1522
1523 /* dentry stuff */
1524
1525 /*
1526  *      Exceptional case: normally we are not allowed to unhash a busy
1527  * directory. In this case, however, we can do it - no aliasing problems
1528  * due to the way we treat inodes.
1529  *
1530  * Rewrite the inode's ownerships here because the owning task may have
1531  * performed a setuid(), etc.
1532  *
1533  * Before the /proc/pid/status file was created the only way to read
1534  * the effective uid of a /process was to stat /proc/pid.  Reading
1535  * /proc/pid/status is slow enough that procps and other packages
1536  * kept stating /proc/pid.  To keep the rules in /proc simple I have
1537  * made this apply to all per process world readable and executable
1538  * directories.
1539  */
1540 static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1541 {
1542         struct inode *inode = dentry->d_inode;
1543         struct task_struct *task = get_proc_task(inode);
1544         const struct cred *cred;
1545
1546         if (task) {
1547                 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1548                     task_dumpable(task)) {
1549                         rcu_read_lock();
1550                         cred = __task_cred(task);
1551                         inode->i_uid = cred->euid;
1552                         inode->i_gid = cred->egid;
1553                         rcu_read_unlock();
1554                 } else {
1555                         inode->i_uid = 0;
1556                         inode->i_gid = 0;
1557                 }
1558                 inode->i_mode &= ~(S_ISUID | S_ISGID);
1559                 security_task_to_inode(task, inode);
1560                 put_task_struct(task);
1561                 return 1;
1562         }
1563         d_drop(dentry);
1564         return 0;
1565 }
1566
1567 static int pid_delete_dentry(struct dentry * dentry)
1568 {
1569         /* Is the task we represent dead?
1570          * If so, then don't put the dentry on the lru list,
1571          * kill it immediately.
1572          */
1573         return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
1574 }
1575
1576 static const struct dentry_operations pid_dentry_operations =
1577 {
1578         .d_revalidate   = pid_revalidate,
1579         .d_delete       = pid_delete_dentry,
1580 };
1581
1582 /* Lookups */
1583
/*
 * Callback that attaches a new inode to @dentry (a child of @dir) on behalf
 * of @task; @ptr carries callback-specific data.  As used by the callers in
 * this file, NULL means @dentry was instantiated; any other value (such as an
 * ERR_PTR) is returned or used in its place.
 */
typedef struct dentry *instantiate_t(struct inode *dir, struct dentry *dentry,
				struct task_struct *task, const void *ptr);
1586
1587 /*
1588  * Fill a directory entry.
1589  *
1590  * If possible create the dcache entry and derive our inode number and
1591  * file type from dcache entry.
1592  *
1593  * Since all of the proc inode numbers are dynamically generated, the inode
1594  * numbers do not exist until the inode is cache.  This means creating the
1595  * the dcache entry in readdir is necessary to keep the inode numbers
1596  * reported by readdir in sync with the inode numbers reported
1597  * by stat.
1598  */
1599 static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
1600         char *name, int len,
1601         instantiate_t instantiate, struct task_struct *task, const void *ptr)
1602 {
1603         struct dentry *child, *dir = filp->f_path.dentry;
1604         struct inode *inode;
1605         struct qstr qname;
1606         ino_t ino = 0;
1607         unsigned type = DT_UNKNOWN;
1608
1609         qname.name = name;
1610         qname.len  = len;
1611         qname.hash = full_name_hash(name, len);
1612
1613         child = d_lookup(dir, &qname);
1614         if (!child) {
1615                 struct dentry *new;
1616                 new = d_alloc(dir, &qname);
1617                 if (new) {
1618                         child = instantiate(dir->d_inode, new, task, ptr);
1619                         if (child)
1620                                 dput(new);
1621                         else
1622                                 child = new;
1623                 }
1624         }
1625         if (!child || IS_ERR(child) || !child->d_inode)
1626                 goto end_instantiate;
1627         inode = child->d_inode;
1628         if (inode) {
1629                 ino = inode->i_ino;
1630                 type = inode->i_mode >> 12;
1631         }
1632         dput(child);
1633 end_instantiate:
1634         if (!ino)
1635                 ino = find_inode_number(dir, &qname);
1636         if (!ino)
1637                 ino = 1;
1638         return filldir(dirent, name, len, filp->f_pos, ino, type);
1639 }
1640
1641 static unsigned name_to_int(struct dentry *dentry)
1642 {
1643         const char *name = dentry->d_name.name;
1644         int len = dentry->d_name.len;
1645         unsigned n = 0;
1646
1647         if (len > 1 && *name == '0')
1648                 goto out;
1649         while (len-- > 0) {
1650                 unsigned c = *name++ - '0';
1651                 if (c > 9)
1652                         goto out;
1653                 if (n >= (~0U-9)/10)
1654                         goto out;
1655                 n *= 10;
1656                 n += c;
1657         }
1658         return n;
1659 out:
1660         return ~0U;
1661 }
1662
1663 #define PROC_FDINFO_MAX 64
1664
1665 static int proc_fd_info(struct inode *inode, struct path *path, char *info)
1666 {
1667         struct task_struct *task = get_proc_task(inode);
1668         struct files_struct *files = NULL;
1669         struct file *file;
1670         int fd = proc_fd(inode);
1671
1672         if (task) {
1673                 files = get_files_struct(task);
1674                 put_task_struct(task);
1675         }
1676         if (files) {
1677                 /*
1678                  * We are not taking a ref to the file structure, so we must
1679                  * hold ->file_lock.
1680                  */
1681                 spin_lock(&files->file_lock);
1682                 file = fcheck_files(files, fd);
1683                 if (file) {
1684                         if (path) {
1685                                 *path = file->f_path;
1686                                 path_get(&file->f_path);
1687                         }
1688                         if (info)
1689                                 snprintf(info, PROC_FDINFO_MAX,
1690                                          "pos:\t%lli\n"
1691                                          "flags:\t0%o\n",
1692                                          (long long) file->f_pos,
1693                                          file->f_flags);
1694                         spin_unlock(&files->file_lock);
1695                         put_files_struct(files);
1696                         return 0;
1697                 }
1698                 spin_unlock(&files->file_lock);
1699                 put_files_struct(files);
1700         }
1701         return -ENOENT;
1702 }
1703
1704 static int proc_fd_link(struct inode *inode, struct path *path)
1705 {
1706         return proc_fd_info(inode, path, NULL);
1707 }
1708
1709 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
1710 {
1711         struct inode *inode = dentry->d_inode;
1712         struct task_struct *task = get_proc_task(inode);
1713         int fd = proc_fd(inode);
1714         struct files_struct *files;
1715         const struct cred *cred;
1716
1717         if (task) {
1718                 files = get_files_struct(task);
1719                 if (files) {
1720                         rcu_read_lock();
1721                         if (fcheck_files(files, fd)) {
1722                                 rcu_read_unlock();
1723                                 put_files_struct(files);
1724                                 if (task_dumpable(task)) {
1725                                         rcu_read_lock();
1726                                         cred = __task_cred(task);
1727                                         inode->i_uid = cred->euid;
1728                                         inode->i_gid = cred->egid;
1729                                         rcu_read_unlock();
1730                                 } else {
1731                                         inode->i_uid = 0;
1732                                         inode->i_gid = 0;
1733                                 }
1734                                 inode->i_mode &= ~(S_ISUID | S_ISGID);
1735                                 security_task_to_inode(task, inode);
1736                                 put_task_struct(task);
1737                                 return 1;
1738                         }
1739                         rcu_read_unlock();
1740                         put_files_struct(files);
1741                 }
1742                 put_task_struct(task);
1743         }
1744         d_drop(dentry);
1745         return 0;
1746 }
1747
1748 static const struct dentry_operations tid_fd_dentry_operations =
1749 {
1750         .d_revalidate   = tid_fd_revalidate,
1751         .d_delete       = pid_delete_dentry,
1752 };
1753
1754 static struct dentry *proc_fd_instantiate(struct inode *dir,
1755         struct dentry *dentry, struct task_struct *task, const void *ptr)
1756 {
1757         unsigned fd = *(const unsigned *)ptr;
1758         struct file *file;
1759         struct files_struct *files;
1760         struct inode *inode;
1761         struct proc_inode *ei;
1762         struct dentry *error = ERR_PTR(-ENOENT);
1763
1764         inode = proc_pid_make_inode(dir->i_sb, task);
1765         if (!inode)
1766                 goto out;
1767         ei = PROC_I(inode);
1768         ei->fd = fd;
1769         files = get_files_struct(task);
1770         if (!files)
1771                 goto out_iput;
1772         inode->i_mode = S_IFLNK;
1773
1774         /*
1775          * We are not taking a ref to the file structure, so we must
1776          * hold ->file_lock.
1777          */
1778         spin_lock(&files->file_lock);
1779         file = fcheck_files(files, fd);
1780         if (!file)
1781                 goto out_unlock;
1782         if (file->f_mode & FMODE_READ)
1783                 inode->i_mode |= S_IRUSR | S_IXUSR;
1784         if (file->f_mode & FMODE_WRITE)
1785                 inode->i_mode |= S_IWUSR | S_IXUSR;
1786         spin_unlock(&files->file_lock);
1787         put_files_struct(files);
1788
1789         inode->i_op = &proc_pid_link_inode_operations;
1790         inode->i_size = 64;
1791         ei->op.proc_get_link = proc_fd_link;
1792         dentry->d_op = &tid_fd_dentry_operations;
1793         d_add(dentry, inode);
1794         /* Close the race of the process dying before we return the dentry */
1795         if (tid_fd_revalidate(dentry, NULL))
1796                 error = NULL;
1797
1798  out:
1799         return error;
1800 out_unlock:
1801         spin_unlock(&files->file_lock);
1802         put_files_struct(files);
1803 out_iput:
1804         iput(inode);
1805         goto out;
1806 }
1807
1808 static struct dentry *proc_lookupfd_common(struct inode *dir,
1809                                            struct dentry *dentry,
1810                                            instantiate_t instantiate)
1811 {
1812         struct task_struct *task = get_proc_task(dir);
1813         unsigned fd = name_to_int(dentry);
1814         struct dentry *result = ERR_PTR(-ENOENT);
1815
1816         if (!task)
1817                 goto out_no_task;
1818         if (fd == ~0U)
1819                 goto out;
1820
1821         result = instantiate(dir, dentry, task, &fd);
1822 out:
1823         put_task_struct(task);
1824 out_no_task:
1825         return result;
1826 }
1827
1828 static int proc_readfd_common(struct file * filp, void * dirent,
1829                               filldir_t filldir, instantiate_t instantiate)
1830 {
1831         struct dentry *dentry = filp->f_path.dentry;
1832         struct inode *inode = dentry->d_inode;
1833         struct task_struct *p = get_proc_task(inode);
1834         unsigned int fd, ino;
1835         int retval;
1836         struct files_struct * files;
1837
1838         retval = -ENOENT;
1839         if (!p)
1840                 goto out_no_task;
1841         retval = 0;
1842
1843         fd = filp->f_pos;
1844         switch (fd) {
1845                 case 0:
1846                         if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
1847                                 goto out;
1848                         filp->f_pos++;
1849                 case 1:
1850                         ino = parent_ino(dentry);
1851                         if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
1852                                 goto out;
1853                         filp->f_pos++;
1854                 default:
1855                         files = get_files_struct(p);
1856                         if (!files)
1857                                 goto out;
1858                         rcu_read_lock();
1859                         for (fd = filp->f_pos-2;
1860                              fd < files_fdtable(files)->max_fds;
1861                              fd++, filp->f_pos++) {
1862                                 char name[PROC_NUMBUF];
1863                                 int len;
1864
1865                                 if (!fcheck_files(files, fd))
1866                                         continue;
1867                                 rcu_read_unlock();
1868
1869                                 len = snprintf(name, sizeof(name), "%d", fd);
1870                                 if (proc_fill_cache(filp, dirent, filldir,
1871                                                     name, len, instantiate,
1872                                                     p, &fd) < 0) {
1873                                         rcu_read_lock();
1874                                         break;
1875                                 }
1876                                 rcu_read_lock();
1877                         }
1878                         rcu_read_unlock();
1879                         put_files_struct(files);
1880         }
1881 out:
1882         put_task_struct(p);
1883 out_no_task:
1884         return retval;
1885 }
1886
1887 static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
1888                                     struct nameidata *nd)
1889 {
1890         return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
1891 }
1892
1893 static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
1894 {
1895         return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
1896 }
1897
1898 static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
1899                                       size_t len, loff_t *ppos)
1900 {
1901         char tmp[PROC_FDINFO_MAX];
1902         int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp);
1903         if (!err)
1904                 err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp));
1905         return err;
1906 }
1907
/*
 * File operations installed on each fdinfo entry by
 * proc_fdinfo_instantiate(): opened non-seekable, read-only.
 */
1908 static const struct file_operations proc_fdinfo_file_operations = {
1909         .open           = nonseekable_open,
1910         .read           = proc_fdinfo_read,
1911 };
1912
/* Directory file operations for the "fd" directory (see tgid_base_stuff). */
1913 static const struct file_operations proc_fd_operations = {
1914         .read           = generic_read_dir,
1915         .readdir        = proc_readfd,
1916 };
1917
1918 /*
1919  * /proc/pid/fd needs a special permission handler so that a process can still
1920  * access /proc/self/fd after it has executed a setuid().
1921  */
1922 static int proc_fd_permission(struct inode *inode, int mask)
1923 {
1924         int rv;
1925
1926         rv = generic_permission(inode, mask, NULL);
1927         if (rv == 0)
1928                 return 0;
1929         if (task_pid(current) == proc_pid(inode))
1930                 rv = 0;
1931         return rv;
1932 }
1933
1934 /*
1935  * proc directories can do almost nothing..
1936  */
/*
 * Inode operations for the "fd" directory: numeric lookup, the relaxed
 * permission check from proc_fd_permission(), and proc_setattr.
 */
1937 static const struct inode_operations proc_fd_inode_operations = {
1938         .lookup         = proc_lookupfd,
1939         .permission     = proc_fd_permission,
1940         .setattr        = proc_setattr,
1941 };
1942
1943 static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
1944         struct dentry *dentry, struct task_struct *task, const void *ptr)
1945 {
1946         unsigned fd = *(unsigned *)ptr;
1947         struct inode *inode;
1948         struct proc_inode *ei;
1949         struct dentry *error = ERR_PTR(-ENOENT);
1950
1951         inode = proc_pid_make_inode(dir->i_sb, task);
1952         if (!inode)
1953                 goto out;
1954         ei = PROC_I(inode);
1955         ei->fd = fd;
1956         inode->i_mode = S_IFREG | S_IRUSR;
1957         inode->i_fop = &proc_fdinfo_file_operations;
1958         dentry->d_op = &tid_fd_dentry_operations;
1959         d_add(dentry, inode);
1960         /* Close the race of the process dying before we return the dentry */
1961         if (tid_fd_revalidate(dentry, NULL))
1962                 error = NULL;
1963
1964  out:
1965         return error;
1966 }
1967
1968 static struct dentry *proc_lookupfdinfo(struct inode *dir,
1969                                         struct dentry *dentry,
1970                                         struct nameidata *nd)
1971 {
1972         return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
1973 }
1974
1975 static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
1976 {
1977         return proc_readfd_common(filp, dirent, filldir,
1978                                   proc_fdinfo_instantiate);
1979 }
1980
/* Directory file operations for the "fdinfo" directory (see tgid_base_stuff). */
1981 static const struct file_operations proc_fdinfo_operations = {
1982         .read           = generic_read_dir,
1983         .readdir        = proc_readfdinfo,
1984 };
1985
1986 /*
1987  * proc directories can do almost nothing..
1988  */
/*
 * Inode operations for the "fdinfo" directory.  Unlike the "fd" directory
 * there is no custom ->permission handler here.
 */
1989 static const struct inode_operations proc_fdinfo_inode_operations = {
1990         .lookup         = proc_lookupfdinfo,
1991         .setattr        = proc_setattr,
1992 };
1993
1994
1995 static struct dentry *proc_pident_instantiate(struct inode *dir,
1996         struct dentry *dentry, struct task_struct *task, const void *ptr)
1997 {
1998         const struct pid_entry *p = ptr;
1999         struct inode *inode;
2000         struct proc_inode *ei;
2001         struct dentry *error = ERR_PTR(-ENOENT);
2002
2003         inode = proc_pid_make_inode(dir->i_sb, task);
2004         if (!inode)
2005                 goto out;
2006
2007         ei = PROC_I(inode);
2008         inode->i_mode = p->mode;
2009         if (S_ISDIR(inode->i_mode))
2010                 inode->i_nlink = 2;     /* Use getattr to fix if necessary */
2011         if (p->iop)
2012                 inode->i_op = p->iop;
2013         if (p->fop)
2014                 inode->i_fop = p->fop;
2015         ei->op = p->op;
2016         dentry->d_op = &pid_dentry_operations;
2017         d_add(dentry, inode);
2018         /* Close the race of the process dying before we return the dentry */
2019         if (pid_revalidate(dentry, NULL))
2020                 error = NULL;
2021 out:
2022         return error;
2023 }
2024
2025 static struct dentry *proc_pident_lookup(struct inode *dir,
2026                                          struct dentry *dentry,
2027                                          const struct pid_entry *ents,
2028                                          unsigned int nents)
2029 {
2030         struct dentry *error;
2031         struct task_struct *task = get_proc_task(dir);
2032         const struct pid_entry *p, *last;
2033
2034         error = ERR_PTR(-ENOENT);
2035
2036         if (!task)
2037                 goto out_no_task;
2038
2039         /*
2040          * Yes, it does not scale. And it should not. Don't add
2041          * new entries into /proc/<tgid>/ without very good reasons.
2042          */
2043         last = &ents[nents - 1];
2044         for (p = ents; p <= last; p++) {
2045                 if (p->len != dentry->d_name.len)
2046                         continue;
2047                 if (!memcmp(dentry->d_name.name, p->name, p->len))
2048                         break;
2049         }
2050         if (p > last)
2051                 goto out;
2052
2053         error = proc_pident_instantiate(dir, dentry, task, p);
2054 out:
2055         put_task_struct(task);
2056 out_no_task:
2057         return error;
2058 }
2059
2060 static int proc_pident_fill_cache(struct file *filp, void *dirent,
2061         filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
2062 {
2063         return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
2064                                 proc_pident_instantiate, task, p);
2065 }
2066
2067 static int proc_pident_readdir(struct file *filp,
2068                 void *dirent, filldir_t filldir,
2069                 const struct pid_entry *ents, unsigned int nents)
2070 {
2071         int i;
2072         struct dentry *dentry = filp->f_path.dentry;
2073         struct inode *inode = dentry->d_inode;
2074         struct task_struct *task = get_proc_task(inode);
2075         const struct pid_entry *p, *last;
2076         ino_t ino;
2077         int ret;
2078
2079         ret = -ENOENT;
2080         if (!task)
2081                 goto out_no_task;
2082
2083         ret = 0;
2084         i = filp->f_pos;
2085         switch (i) {
2086         case 0:
2087                 ino = inode->i_ino;
2088                 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
2089                         goto out;
2090                 i++;
2091                 filp->f_pos++;
2092                 /* fall through */
2093         case 1:
2094                 ino = parent_ino(dentry);
2095                 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
2096                         goto out;
2097                 i++;
2098                 filp->f_pos++;
2099                 /* fall through */
2100         default:
2101                 i -= 2;
2102                 if (i >= nents) {
2103                         ret = 1;
2104                         goto out;
2105                 }
2106                 p = ents + i;
2107                 last = &ents[nents - 1];
2108                 while (p <= last) {
2109                         if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0)
2110                                 goto out;
2111                         filp->f_pos++;
2112                         p++;
2113                 }
2114         }
2115
2116         ret = 1;
2117 out:
2118         put_task_struct(task);
2119 out_no_task:
2120         return ret;
2121 }
2122
2123 #ifdef CONFIG_SECURITY
2124 static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
2125                                   size_t count, loff_t *ppos)
2126 {
2127         struct inode * inode = file->f_path.dentry->d_inode;
2128         char *p = NULL;
2129         ssize_t length;
2130         struct task_struct *task = get_proc_task(inode);
2131
2132         if (!task)
2133                 return -ESRCH;
2134
2135         length = security_getprocattr(task,
2136                                       (char*)file->f_path.dentry->d_name.name,
2137                                       &p);
2138         put_task_struct(task);
2139         if (length > 0)
2140                 length = simple_read_from_buffer(buf, count, ppos, p, length);
2141         kfree(p);
2142         return length;
2143 }
2144
2145 static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
2146                                    size_t count, loff_t *ppos)
2147 {
2148         struct inode * inode = file->f_path.dentry->d_inode;
2149         char *page;
2150         ssize_t length;
2151         struct task_struct *task = get_proc_task(inode);
2152
2153         length = -ESRCH;
2154         if (!task)
2155                 goto out_no_task;
2156         if (count > PAGE_SIZE)
2157                 count = PAGE_SIZE;
2158
2159         /* No partial writes. */
2160         length = -EINVAL;
2161         if (*ppos != 0)
2162                 goto out;
2163
2164         length = -ENOMEM;
2165         page = (char*)__get_free_page(GFP_TEMPORARY);
2166         if (!page)
2167                 goto out;
2168
2169         length = -EFAULT;
2170         if (copy_from_user(page, buf, count))
2171                 goto out_free;
2172
2173         /* Guard against adverse ptrace interaction */
2174         length = mutex_lock_interruptible(&task->cred_guard_mutex);
2175         if (length < 0)
2176                 goto out_free;
2177
2178         length = security_setprocattr(task,
2179                                       (char*)file->f_path.dentry->d_name.name,
2180                                       (void*)page, count);
2181         mutex_unlock(&task->cred_guard_mutex);
2182 out_free:
2183         free_page((unsigned long) page);
2184 out:
2185         put_task_struct(task);
2186 out_no_task:
2187         return length;
2188 }
2189
/* File operations shared by every entry of attr_dir_stuff. */
2190 static const struct file_operations proc_pid_attr_operations = {
2191         .read           = proc_pid_attr_read,
2192         .write          = proc_pid_attr_write,
2193 };
2194
/*
 * Contents of the "attr" directory.  The entry name is what
 * proc_pid_attr_read()/proc_pid_attr_write() pass on as the attribute
 * name.  "prev" is the only entry without write permission bits.
 */
2195 static const struct pid_entry attr_dir_stuff[] = {
2196         REG("current",    S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2197         REG("prev",       S_IRUGO,         proc_pid_attr_operations),
2198         REG("exec",       S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2199         REG("fscreate",   S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2200         REG("keycreate",  S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2201         REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations),
2202 };
2203
2204 static int proc_attr_dir_readdir(struct file * filp,
2205                              void * dirent, filldir_t filldir)
2206 {
2207         return proc_pident_readdir(filp,dirent,filldir,
2208                                    attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff));
2209 }
2210
/* Directory file operations for the "attr" directory (see tgid_base_stuff). */
2211 static const struct file_operations proc_attr_dir_operations = {
2212         .read           = generic_read_dir,
2213         .readdir        = proc_attr_dir_readdir,
2214 };
2215
2216 static struct dentry *proc_attr_dir_lookup(struct inode *dir,
2217                                 struct dentry *dentry, struct nameidata *nd)
2218 {
2219         return proc_pident_lookup(dir, dentry,
2220                                   attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
2221 }
2222
/* Inode operations for the "attr" directory. */
2223 static const struct inode_operations proc_attr_dir_inode_operations = {
2224         .lookup         = proc_attr_dir_lookup,
2225         .getattr        = pid_getattr,
2226         .setattr        = proc_setattr,
2227 };
2228
2229 #endif
2230
2231 #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
2232 static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
2233                                          size_t count, loff_t *ppos)
2234 {
2235         struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
2236         struct mm_struct *mm;
2237         char buffer[PROC_NUMBUF];
2238         size_t len;
2239         int ret;
2240
2241         if (!task)
2242                 return -ESRCH;
2243
2244         ret = 0;
2245         mm = get_task_mm(task);
2246         if (mm) {
2247                 len = snprintf(buffer, sizeof(buffer), "%08lx\n",
2248                                ((mm->flags & MMF_DUMP_FILTER_MASK) >>
2249                                 MMF_DUMP_FILTER_SHIFT));
2250                 mmput(mm);
2251                 ret = simple_read_from_buffer(buf, count, ppos, buffer, len);
2252         }
2253
2254         put_task_struct(task);
2255
2256         return ret;
2257 }
2258
2259 static ssize_t proc_coredump_filter_write(struct file *file,
2260                                           const char __user *buf,
2261                                           size_t count,
2262                                           loff_t *ppos)
2263 {
2264         struct task_struct *task;
2265         struct mm_struct *mm;
2266         char buffer[PROC_NUMBUF], *end;
2267         unsigned int val;
2268         int ret;
2269         int i;
2270         unsigned long mask;
2271
2272         ret = -EFAULT;
2273         memset(buffer, 0, sizeof(buffer));
2274         if (count > sizeof(buffer) - 1)
2275                 count = sizeof(buffer) - 1;
2276         if (copy_from_user(buffer, buf, count))
2277                 goto out_no_task;
2278
2279         ret = -EINVAL;
2280         val = (unsigned int)simple_strtoul(buffer, &end, 0);
2281         if (*end == '\n')
2282                 end++;
2283         if (end - buffer == 0)
2284                 goto out_no_task;
2285
2286         ret = -ESRCH;
2287         task = get_proc_task(file->f_dentry->d_inode);
2288         if (!task)
2289                 goto out_no_task;
2290
2291         ret = end - buffer;
2292         mm = get_task_mm(task);
2293         if (!mm)
2294                 goto out_no_mm;
2295
2296         for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
2297                 if (val & mask)
2298                         set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
2299                 else
2300                         clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
2301         }
2302
2303         mmput(mm);
2304  out_no_mm:
2305         put_task_struct(task);
2306  out_no_task:
2307         return ret;
2308 }
2309
/* File operations for the "coredump_filter" entry of tgid_base_stuff. */
2310 static const struct file_operations proc_coredump_filter_operations = {
2311         .read           = proc_coredump_filter_read,
2312         .write          = proc_coredump_filter_write,
2313 };
2314 #endif
2315
2316 /*
2317  * /proc/self:
2318  */
2319 static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
2320                               int buflen)
2321 {
2322         struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2323         pid_t tgid = task_tgid_nr_ns(current, ns);
2324         char tmp[PROC_NUMBUF];
2325         if (!tgid)
2326                 return -ENOENT;
2327         sprintf(tmp, "%d", tgid);
2328         return vfs_readlink(dentry,buffer,buflen,tmp);
2329 }
2330
2331 static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
2332 {
2333         struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2334         pid_t tgid = task_tgid_nr_ns(current, ns);
2335         char *name = ERR_PTR(-ENOENT);
2336         if (tgid) {
2337                 name = __getname();
2338                 if (!name)
2339                         name = ERR_PTR(-ENOMEM);
2340                 else
2341                         sprintf(name, "%d", tgid);
2342         }
2343         nd_set_link(nd, name);
2344         return NULL;
2345 }
2346
/*
 * ->put_link() for /proc/self: give back the name buffer stored by
 * proc_self_follow_link(), unless the stored value is an ERR_PTR.
 */
static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd,
                                void *cookie)
{
        char *target = nd_get_link(nd);

        if (IS_ERR(target))
                return;

        __putname(target);
}
2354
/* Symlink operations for the "self" entry of proc_base_stuff. */
2355 static const struct inode_operations proc_self_inode_operations = {
2356         .readlink       = proc_self_readlink,
2357         .follow_link    = proc_self_follow_link,
2358         .put_link       = proc_self_put_link,
2359 };
2360
2361 /*
2362  * proc base
2363  *
2364  * These are the directory entries in the root directory of /proc
2365  * that properly belong to the /proc filesystem, as they
2366  * describe something that is process related.
2367  */
/* Table searched by proc_base_lookup(); "self" is its only entry. */
2368 static const struct pid_entry proc_base_stuff[] = {
2369         NOD("self", S_IFLNK|S_IRWXUGO,
2370                 &proc_self_inode_operations, NULL, {}),
2371 };
2372
2373 /*
2374  *      Exceptional case: normally we are not allowed to unhash a busy
2375  * directory. In this case, however, we can do it - no aliasing problems
2376  * due to the way we treat inodes.
2377  */
2378 static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
2379 {
2380         struct inode *inode = dentry->d_inode;
2381         struct task_struct *task = get_proc_task(inode);
2382         if (task) {
2383                 put_task_struct(task);
2384                 return 1;
2385         }
2386         d_drop(dentry);
2387         return 0;
2388 }
2389
/* Dentry operations installed by proc_base_instantiate(). */
2390 static const struct dentry_operations proc_base_dentry_operations =
2391 {
2392         .d_revalidate   = proc_base_revalidate,
2393         .d_delete       = pid_delete_dentry,
2394 };
2395
2396 static struct dentry *proc_base_instantiate(struct inode *dir,
2397         struct dentry *dentry, struct task_struct *task, const void *ptr)
2398 {
2399         const struct pid_entry *p = ptr;
2400         struct inode *inode;
2401         struct proc_inode *ei;
2402         struct dentry *error = ERR_PTR(-EINVAL);
2403
2404         /* Allocate the inode */
2405         error = ERR_PTR(-ENOMEM);
2406         inode = new_inode(dir->i_sb);
2407         if (!inode)
2408                 goto out;
2409
2410         /* Initialize the inode */
2411         ei = PROC_I(inode);
2412         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
2413
2414         /*
2415          * grab the reference to the task.
2416          */
2417         ei->pid = get_task_pid(task, PIDTYPE_PID);
2418         if (!ei->pid)
2419                 goto out_iput;
2420
2421         inode->i_mode = p->mode;
2422         if (S_ISDIR(inode->i_mode))
2423                 inode->i_nlink = 2;
2424         if (S_ISLNK(inode->i_mode))
2425                 inode->i_size = 64;
2426         if (p->iop)
2427                 inode->i_op = p->iop;
2428         if (p->fop)
2429                 inode->i_fop = p->fop;
2430         ei->op = p->op;
2431         dentry->d_op = &proc_base_dentry_operations;
2432         d_add(dentry, inode);
2433         error = NULL;
2434 out:
2435         return error;
2436 out_iput:
2437         iput(inode);
2438         goto out;
2439 }
2440
2441 static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry)
2442 {
2443         struct dentry *error;
2444         struct task_struct *task = get_proc_task(dir);
2445         const struct pid_entry *p, *last;
2446
2447         error = ERR_PTR(-ENOENT);
2448
2449         if (!task)
2450                 goto out_no_task;
2451
2452         /* Lookup the directory entry */
2453         last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1];
2454         for (p = proc_base_stuff; p <= last; p++) {
2455                 if (p->len != dentry->d_name.len)
2456                         continue;
2457                 if (!memcmp(dentry->d_name.name, p->name, p->len))
2458                         break;
2459         }
2460         if (p > last)
2461                 goto out;
2462
2463         error = proc_base_instantiate(dir, dentry, task, p);
2464
2465 out:
2466         put_task_struct(task);
2467 out_no_task:
2468         return error;
2469 }
2470
2471 static int proc_base_fill_cache(struct file *filp, void *dirent,
2472         filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
2473 {
2474         return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
2475                                 proc_base_instantiate, task, p);
2476 }
2477
2478 #ifdef CONFIG_TASK_IO_ACCOUNTING
2479 static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
2480 {
2481         struct task_io_accounting acct = task->ioac;
2482         unsigned long flags;
2483
2484         if (!ptrace_may_access(task, PTRACE_MODE_READ))
2485                 return -EACCES;
2486
2487         if (whole && lock_task_sighand(task, &flags)) {
2488                 struct task_struct *t = task;
2489
2490                 task_io_accounting_add(&acct, &task->signal->ioac);
2491                 while_each_thread(task, t)
2492                         task_io_accounting_add(&acct, &t->ioac);
2493
2494                 unlock_task_sighand(task, &flags);
2495         }
2496         return sprintf(buffer,
2497                         "rchar: %llu\n"
2498                         "wchar: %llu\n"
2499                         "syscr: %llu\n"
2500                         "syscw: %llu\n"
2501                         "read_bytes: %llu\n"
2502                         "write_bytes: %llu\n"
2503                         "cancelled_write_bytes: %llu\n",
2504                         (unsigned long long)acct.rchar,
2505                         (unsigned long long)acct.wchar,
2506                         (unsigned long long)acct.syscr,
2507                         (unsigned long long)acct.syscw,
2508                         (unsigned long long)acct.read_bytes,
2509                         (unsigned long long)acct.write_bytes,
2510                         (unsigned long long)acct.cancelled_write_bytes);
2511 }
2512
/* I/O accounting for this task only (whole == 0). */
static int proc_tid_io_accounting(struct task_struct *task, char *buffer)
{
        const int whole_group = 0;

        return do_io_accounting(task, buffer, whole_group);
}
2517
/* I/O accounting summed over the thread group (whole == 1). */
static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
{
        const int whole_group = 1;

        return do_io_accounting(task, buffer, whole_group);
}
2522 #endif /* CONFIG_TASK_IO_ACCOUNTING */
2523
2524 static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns,
2525                                 struct pid *pid, struct task_struct *task)
2526 {
2527         int err = lock_trace(task);
2528         if (!err) {
2529                 seq_printf(m, "%08x\n", task->personality);
2530                 unlock_trace(task);
2531         }
2532         return err;
2533 }
2534
2535 /*
2536  * Thread groups
2537  */
/* Forward declarations: referenced by the "task" entry of tgid_base_stuff. */
2538 static const struct file_operations proc_task_operations;
2539 static const struct inode_operations proc_task_inode_operations;
2540
/*
 * Table walked by proc_tgid_base_readdir() and searched by
 * proc_tgid_base_lookup().  proc_pident_readdir() reports the entries in
 * table order, and proc_pid_instantiate() counts the directory entries
 * here when it sets i_nlink.  Many entries exist only under the config
 * option guarding them.
 */
2541 static const struct pid_entry tgid_base_stuff[] = {
2542         DIR("task",       S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
2543         DIR("fd",         S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2544         DIR("fdinfo",     S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2545 #ifdef CONFIG_NET
2546         DIR("net",        S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations),
2547 #endif
2548         REG("environ",    S_IRUSR, proc_environ_operations),
2549         INF("auxv",       S_IRUSR, proc_pid_auxv),
2550         ONE("status",     S_IRUGO, proc_pid_status),
2551         ONE("personality", S_IRUGO, proc_pid_personality),
2552         INF("limits",     S_IRUSR, proc_pid_limits),
2553 #ifdef CONFIG_SCHED_DEBUG
2554         REG("sched",      S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2555 #endif
2556 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2557         INF("syscall",    S_IRUGO, proc_pid_syscall),
2558 #endif
2559         INF("cmdline",    S_IRUGO, proc_pid_cmdline),
2560         ONE("stat",       S_IRUGO, proc_tgid_stat),
2561         ONE("statm",      S_IRUGO, proc_pid_statm),
2562         REG("maps",       S_IRUGO, proc_maps_operations),
2563 #ifdef CONFIG_NUMA
2564         REG("numa_maps",  S_IRUGO, proc_numa_maps_operations),
2565 #endif
2566         REG("mem",        S_IRUSR|S_IWUSR, proc_mem_operations),
2567         LNK("cwd",        proc_cwd_link),
2568         LNK("root",       proc_root_link),
2569         LNK("exe",        proc_exe_link),
2570         REG("mounts",     S_IRUGO, proc_mounts_operations),
2571         REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
2572         REG("mountstats", S_IRUSR, proc_mountstats_operations),
2573 #ifdef CONFIG_PROC_PAGE_MONITOR
2574         REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2575         REG("smaps",      S_IRUGO, proc_smaps_operations),
2576         REG("pagemap",    S_IRUSR, proc_pagemap_operations),
2577 #endif
2578 #ifdef CONFIG_SECURITY
2579         DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
2580 #endif
2581 #ifdef CONFIG_KALLSYMS
2582         INF("wchan",      S_IRUGO, proc_pid_wchan),
2583 #endif
2584 #ifdef CONFIG_STACKTRACE
2585         ONE("stack",      S_IRUGO, proc_pid_stack),
2586 #endif
2587 #ifdef CONFIG_SCHEDSTATS
2588         INF("schedstat",  S_IRUGO, proc_pid_schedstat),
2589 #endif
2590 #ifdef CONFIG_LATENCYTOP
2591         REG("latency",  S_IRUGO, proc_lstats_operations),
2592 #endif
2593 #ifdef CONFIG_PROC_PID_CPUSET
2594         REG("cpuset",     S_IRUGO, proc_cpuset_operations),
2595 #endif
2596 #ifdef CONFIG_CGROUPS
2597         REG("cgroup",  S_IRUGO, proc_cgroup_operations),
2598 #endif
2599         INF("oom_score",  S_IRUGO, proc_oom_score),
2600         REG("oom_adj",    S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
2601 #ifdef CONFIG_AUDITSYSCALL
2602         REG("loginuid",   S_IWUSR|S_IRUGO, proc_loginuid_operations),
2603         REG("sessionid",  S_IRUGO, proc_sessionid_operations),
2604 #endif
2605 #ifdef CONFIG_FAULT_INJECTION
2606         REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
2607 #endif
2608 #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
2609         REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations),
2610 #endif
2611 #ifdef CONFIG_TASK_IO_ACCOUNTING
2612         INF("io",       S_IRUSR, proc_tgid_io_accounting),
2613 #endif
2614 };
2615
2616 static int proc_tgid_base_readdir(struct file * filp,
2617                              void * dirent, filldir_t filldir)
2618 {
2619         return proc_pident_readdir(filp,dirent,filldir,
2620                                    tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff));
2621 }
2622
/* Directory file operations installed by proc_pid_instantiate(). */
2623 static const struct file_operations proc_tgid_base_operations = {
2624         .read           = generic_read_dir,
2625         .readdir        = proc_tgid_base_readdir,
2626 };
2627
2628 static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
2629         return proc_pident_lookup(dir, dentry,
2630                                   tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
2631 }
2632
/* Inode operations installed by proc_pid_instantiate(). */
2633 static const struct inode_operations proc_tgid_base_inode_operations = {
2634         .lookup         = proc_tgid_base_lookup,
2635         .getattr        = pid_getattr,
2636         .setattr        = proc_setattr,
2637 };
2638
2639 static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
2640 {
2641         struct dentry *dentry, *leader, *dir;
2642         char buf[PROC_NUMBUF];
2643         struct qstr name;
2644
2645         name.name = buf;
2646         name.len = snprintf(buf, sizeof(buf), "%d", pid);
2647         dentry = d_hash_and_lookup(mnt->mnt_root, &name);
2648         if (dentry) {
2649                 shrink_dcache_parent(dentry);
2650                 d_drop(dentry);
2651                 dput(dentry);
2652         }
2653
2654         name.name = buf;
2655         name.len = snprintf(buf, sizeof(buf), "%d", tgid);
2656         leader = d_hash_and_lookup(mnt->mnt_root, &name);
2657         if (!leader)
2658                 goto out;
2659
2660         name.name = "task";
2661         name.len = strlen(name.name);
2662         dir = d_hash_and_lookup(leader, &name);
2663         if (!dir)
2664                 goto out_put_leader;
2665
2666         name.name = buf;
2667         name.len = snprintf(buf, sizeof(buf), "%d", pid);
2668         dentry = d_hash_and_lookup(dir, &name);
2669         if (dentry) {
2670                 shrink_dcache_parent(dentry);
2671                 d_drop(dentry);
2672                 dput(dentry);
2673         }
2674
2675         dput(dir);
2676 out_put_leader:
2677         dput(leader);
2678 out:
2679         return;
2680 }
2681
2682 /**
2683  * proc_flush_task -  Remove dcache entries for @task from the /proc dcache.
2684  * @task: task that should be flushed.
2685  *
2686  * When flushing dentries from proc, one needs to flush them from global
2687  * proc (proc_mnt) and from all the namespaces' procs this task was seen
2688  * in. This call is supposed to do all of this job.
2689  *
2690  * Looks in the dcache for
2691  * /proc/@pid
2692  * /proc/@tgid/task/@pid
2693  * if either directory is present flushes it and all of its children
2694  * from the dcache.
2695  *
2696  * It is safe and reasonable to cache /proc entries for a task until
2697  * that task exits.  After that they just clog up the dcache with
2698  * useless entries, possibly causing useful dcache entries to be
2699  * flushed instead.  This routine is proved to flush those useless
2700  * dcache entries at process exit time.
2701  *
2702  * NOTE: This routine is just an optimization so it does not guarantee
2703  *       that no dcache entries will exist at process exit time it
2704  *       just makes it very unlikely that any will persist.
2705  */
2706
2707 void proc_flush_task(struct task_struct *task)
2708 {
2709         int i;
2710         struct pid *pid, *tgid;
2711         struct upid *upid;
2712
2713         pid = task_pid(task);
2714         tgid = task_tgid(task);
2715
2716         for (i = 0; i <= pid->level; i++) {
2717                 upid = &pid->numbers[i];
2718                 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
2719                                         tgid->numbers[i].nr);
2720         }
2721
2722         upid = &pid->numbers[pid->level];
2723         if (upid->nr == 1)
2724                 pid_ns_release_proc(upid->ns);
2725 }
2726
2727 static struct dentry *proc_pid_instantiate(struct inode *dir,
2728                                            struct dentry * dentry,
2729                                            struct task_struct *task, const void *ptr)
2730 {
2731         struct dentry *error = ERR_PTR(-ENOENT);
2732         struct inode *inode;
2733
2734         inode = proc_pid_make_inode(dir->i_sb, task);
2735         if (!inode)
2736                 goto out;
2737
2738         inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
2739         inode->i_op = &proc_tgid_base_inode_operations;
2740         inode->i_fop = &proc_tgid_base_operations;
2741         inode->i_flags|=S_IMMUTABLE;
2742
2743         inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff,
2744                 ARRAY_SIZE(tgid_base_stuff));
2745
2746         dentry->d_op = &pid_dentry_operations;
2747
2748         d_add(dentry, inode);
2749         /* Close the race of the process dying before we return the dentry */
2750         if (pid_revalidate(dentry, NULL))
2751                 error = NULL;
2752 out:
2753         return error;
2754 }
2755
2756 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2757 {
2758         struct dentry *result = ERR_PTR(-ENOENT);
2759         struct task_struct *task;
2760         unsigned tgid;
2761         struct pid_namespace *ns;
2762
2763         result = proc_base_lookup(dir, dentry);
2764         if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT)
2765                 goto out;
2766
2767         tgid = name_to_int(dentry);
2768         if (tgid == ~0U)
2769                 goto out;
2770
2771         ns = dentry->d_sb->s_fs_info;
2772         rcu_read_lock();
2773         task = find_task_by_pid_ns(tgid, ns);
2774         if (task)
2775                 get_task_struct(task);
2776         rcu_read_unlock();
2777         if (!task)
2778                 goto out;
2779
2780         result = proc_pid_instantiate(dir, dentry, task, NULL);
2781         put_task_struct(task);
2782 out:
2783         return result;
2784 }
2785
/*
 * Cursor used while walking the thread groups listed in the /proc root:
 * the tgid to search from (or the one just found) together with the
 * task found for it, if any.
 */
struct tgid_iter {
	unsigned int tgid;		/* tgid to start from / tgid found */
	struct task_struct *task;	/* task found for tgid, NULL if none */
};
2794 static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
2795 {
2796         struct pid *pid;
2797
2798         if (iter.task)
2799                 put_task_struct(iter.task);
2800         rcu_read_lock();
2801 retry:
2802         iter.task = NULL;
2803         pid = find_ge_pid(iter.tgid, ns);
2804         if (pid) {
2805                 iter.tgid = pid_nr_ns(pid, ns);
2806                 iter.task = pid_task(pid, PIDTYPE_PID);
2807                 /* What we to know is if the pid we have find is the
2808                  * pid of a thread_group_leader.  Testing for task
2809                  * being a thread_group_leader is the obvious thing
2810                  * todo but there is a window when it fails, due to
2811                  * the pid transfer logic in de_thread.
2812                  *
2813                  * So we perform the straight forward test of seeing
2814                  * if the pid we have found is the pid of a thread
2815                  * group leader, and don't worry if the task we have
2816                  * found doesn't happen to be a thread group leader.
2817                  * As we don't care in the case of readdir.
2818                  */
2819                 if (!iter.task || !has_group_leader_pid(iter.task)) {
2820                         iter.tgid += 1;
2821                         goto retry;
2822                 }
2823                 get_task_struct(iter.task);
2824         }
2825         rcu_read_unlock();
2826         return iter;
2827 }
2828
/*
 * Offset between a tgid and its f_pos in the /proc root directory:
 * proc_pid_readdir() uses f_pos == tgid + TGID_OFFSET for tgid entries.
 */
#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff))
2830
2831 static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
2832         struct tgid_iter iter)
2833 {
2834         char name[PROC_NUMBUF];
2835         int len = snprintf(name, sizeof(name), "%d", iter.tgid);
2836         return proc_fill_cache(filp, dirent, filldir, name, len,
2837                                 proc_pid_instantiate, iter.task, NULL);
2838 }
2839
2840 /* for the /proc/ directory itself, after non-process stuff has been done */
2841 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
2842 {
2843         unsigned int nr;
2844         struct task_struct *reaper;
2845         struct tgid_iter iter;
2846         struct pid_namespace *ns;
2847
2848         if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET)
2849                 goto out_no_task;
2850         nr = filp->f_pos - FIRST_PROCESS_ENTRY;
2851
2852         reaper = get_proc_task(filp->f_path.dentry->d_inode);
2853         if (!reaper)
2854                 goto out_no_task;
2855
2856         for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
2857                 const struct pid_entry *p = &proc_base_stuff[nr];
2858                 if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
2859                         goto out;
2860         }
2861
2862         ns = filp->f_dentry->d_sb->s_fs_info;
2863         iter.task = NULL;
2864         iter.tgid = filp->f_pos - TGID_OFFSET;
2865         for (iter = next_tgid(ns, iter);
2866              iter.task;
2867              iter.tgid += 1, iter = next_tgid(ns, iter)) {
2868                 filp->f_pos = iter.tgid + TGID_OFFSET;
2869                 if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) {
2870                         put_task_struct(iter.task);
2871                         goto out;
2872                 }
2873         }
2874         filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
2875 out:
2876         put_task_struct(reaper);
2877 out_no_task:
2878         return 0;
2879 }
2880
2881 /*
2882  * Tasks
2883  */
2884 static const struct pid_entry tid_base_stuff[] = {
2885         DIR("fd",        S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
2886         DIR("fdinfo",    S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
2887         REG("environ",   S_IRUSR, proc_environ_operations),
2888         INF("auxv",      S_IRUSR, proc_pid_auxv),
2889         ONE("status",    S_IRUGO, proc_pid_status),
2890         ONE("personality", S_IRUGO, proc_pid_personality),
2891         INF("limits",    S_IRUSR, proc_pid_limits),
2892 #ifdef CONFIG_SCHED_DEBUG
2893         REG("sched",     S_IRUGO|S_IWUSR, proc_pid_sched_operations),
2894 #endif
2895 #ifdef CONFIG_HAVE_ARCH_TRACEHOOK
2896         INF("syscall",   S_IRUGO, proc_pid_syscall),
2897 #endif
2898         INF("cmdline",   S_IRUGO, proc_pid_cmdline),
2899         ONE("stat",      S_IRUGO, proc_tid_stat),
2900         ONE("statm",     S_IRUGO, proc_pid_statm),
2901         REG("maps",      S_IRUGO, proc_maps_operations),
2902 #ifdef CONFIG_NUMA
2903         REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
2904 #endif
2905         REG("mem",       S_IRUSR|S_IWUSR, proc_mem_operations),
2906         LNK("cwd",       proc_cwd_link),
2907         LNK("root",      proc_root_link),
2908         LNK("exe",       proc_exe_link),
2909         REG("mounts",    S_IRUGO, proc_mounts_operations),
2910         REG("mountinfo",  S_IRUGO, proc_mountinfo_operations),
2911 #ifdef CONFIG_PROC_PAGE_MONITOR
2912         REG("clear_refs", S_IWUSR, proc_clear_refs_operations),
2913         REG("smaps",     S_IRUGO, proc_smaps_operations),
2914         REG("pagemap",    S_IRUSR, proc_pagemap_operations),
2915 #endif
2916 #ifdef CONFIG_SECURITY
2917         DIR("attr",      S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
2918 #endif
2919 #ifdef CONFIG_KALLSYMS
2920         INF("wchan",     S_IRUGO, proc_pid_wchan),
2921 #endif
2922 #ifdef CONFIG_STACKTRACE
2923         ONE("stack",      S_IRUGO, proc_pid_stack),
2924 #endif
2925 #ifdef CONFIG_SCHEDSTATS
2926         INF("schedstat", S_IRUGO, proc_pid_schedstat),
2927 #endif
2928 #ifdef CONFIG_LATENCYTOP
2929         REG("latency",  S_IRUGO, proc_lstats_operations),
2930 #endif
2931 #ifdef CONFIG_PROC_PID_CPUSET
2932         REG("cpuset",    S_IRUGO, proc_cpuset_operations),
2933 #endif
2934 #ifdef CONFIG_CGROUPS
2935         REG("cgroup",  S_IRUGO, proc_cgroup_operations),
2936 #endif
2937         INF("oom_score", S_IRUGO, proc_oom_score),
2938         REG("oom_adj",   S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
2939 #ifdef CONFIG_AUDITSYSCALL
2940         REG("loginuid",  S_IWUSR|S_IRUGO, proc_loginuid_operations),
2941         REG("sessionid",  S_IRUSR, proc_sessionid_operations),
2942 #endif
2943 #ifdef CONFIG_FAULT_INJECTION
2944         REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations),
2945 #endif
2946 #ifdef CONFIG_TASK_IO_ACCOUNTING
2947         INF("io",       S_IRUSR, proc_tid_io_accounting),
2948 #endif
2949 };
2950
2951 static int proc_tid_base_readdir(struct file * filp,
2952                              void * dirent, filldir_t filldir)
2953 {
2954         return proc_pident_readdir(filp,dirent,filldir,
2955                                    tid_base_stuff,ARRAY_SIZE(tid_base_stuff));
2956 }
2957
2958 static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
2959         return proc_pident_lookup(dir, dentry,
2960                                   tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
2961 }
2962
2963 static const struct file_operations proc_tid_base_operations = {
2964         .read           = generic_read_dir,
2965         .readdir        = proc_tid_base_readdir,
2966 };
2967
2968 static const struct inode_operations proc_tid_base_inode_operations = {
2969         .lookup         = proc_tid_base_lookup,
2970         .getattr        = pid_getattr,
2971         .setattr        = proc_setattr,
2972 };
2973
2974 static struct dentry *proc_task_instantiate(struct inode *dir,
2975         struct dentry *dentry, struct task_struct *task, const void *ptr)
2976 {
2977         struct dentry *error = ERR_PTR(-ENOENT);
2978         struct inode *inode;
2979         inode = proc_pid_make_inode(dir->i_sb, task);
2980
2981         if (!inode)
2982                 goto out;
2983         inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
2984         inode->i_op = &proc_tid_base_inode_operations;
2985         inode->i_fop = &proc_tid_base_operations;
2986         inode->i_flags|=S_IMMUTABLE;
2987
2988         inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff,
2989                 ARRAY_SIZE(tid_base_stuff));
2990
2991         dentry->d_op = &pid_dentry_operations;
2992
2993         d_add(dentry, inode);
2994         /* Close the race of the process dying before we return the dentry */
2995         if (pid_revalidate(dentry, NULL))
2996                 error = NULL;
2997 out:
2998         return error;
2999 }
3000
3001 static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
3002 {
3003         struct dentry *result = ERR_PTR(-ENOENT);
3004         struct task_struct *task;
3005         struct task_struct *leader = get_proc_task(dir);
3006         unsigned tid;
3007         struct pid_namespace *ns;
3008
3009         if (!leader)
3010                 goto out_no_task;
3011
3012         tid = name_to_int(dentry);
3013         if (tid == ~0U)
3014                 goto out;
3015
3016         ns = dentry->d_sb->s_fs_info;
3017         rcu_read_lock();
3018         task = find_task_by_pid_ns(tid, ns);
3019         if (task)
3020                 get_task_struct(task);
3021         rcu_read_unlock();
3022         if (!task)
3023                 goto out;
3024         if (!same_thread_group(leader, task))
3025                 goto out_drop_task;
3026
3027         result = proc_task_instantiate(dir, dentry, task, NULL);
3028 out_drop_task:
3029         put_task_struct(task);
3030 out:
3031         put_task_struct(leader);
3032 out_no_task:
3033         return result;
3034 }
3035
3036 /*
3037  * Find the first tid of a thread group to return to user space.
3038  *
3039  * Usually this is just the thread group leader, but if the users
3040  * buffer was too small or there was a seek into the middle of the
3041  * directory we have more work todo.
3042  *
3043  * In the case of a short read we start with find_task_by_pid.
3044  *
3045  * In the case of a seek we start with the leader and walk nr
3046  * threads past it.
3047  */
3048 static struct task_struct *first_tid(struct task_struct *leader,
3049                 int tid, int nr, struct pid_namespace *ns)
3050 {
3051         struct task_struct *pos;
3052
3053         rcu_read_lock();
3054         /* Attempt to start with the pid of a thread */
3055         if (tid && (nr > 0)) {
3056                 pos = find_task_by_pid_ns(tid, ns);
3057                 if (pos && (pos->group_leader == leader))
3058                         goto found;
3059         }
3060
3061         /* If nr exceeds the number of threads there is nothing todo */
3062         pos = NULL;
3063         if (nr && nr >= get_nr_threads(leader))
3064                 goto out;
3065
3066         /* If we haven't found our starting place yet start
3067          * with the leader and walk nr threads forward.
3068          */
3069         for (pos = leader; nr > 0; --nr) {
3070                 pos = next_thread(pos);
3071                 if (pos == leader) {
3072                         pos = NULL;
3073                         goto out;
3074                 }
3075         }
3076 found:
3077         get_task_struct(pos);
3078 out:
3079         rcu_read_unlock();
3080         return pos;
3081 }
3082
3083 /*
3084  * Find the next thread in the thread list.
3085  * Return NULL if there is an error or no next thread.
3086  *
3087  * The reference to the input task_struct is released.
3088  */
3089 static struct task_struct *next_tid(struct task_struct *start)
3090 {
3091         struct task_struct *pos = NULL;
3092         rcu_read_lock();
3093         if (pid_alive(start)) {
3094                 pos = next_thread(start);
3095                 if (thread_group_leader(pos))
3096                         pos = NULL;
3097                 else
3098                         get_task_struct(pos);
3099         }
3100         rcu_read_unlock();
3101         put_task_struct(start);
3102         return pos;
3103 }
3104
3105 static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
3106         struct task_struct *task, int tid)
3107 {
3108         char name[PROC_NUMBUF];
3109         int len = snprintf(name, sizeof(name), "%d", tid);
3110         return proc_fill_cache(filp, dirent, filldir, name, len,
3111                                 proc_task_instantiate, task, NULL);
3112 }
3113
3114 /* for the /proc/TGID/task/ directories */
3115 static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir)
3116 {
3117         struct dentry *dentry = filp->f_path.dentry;
3118         struct inode *inode = dentry->d_inode;
3119         struct task_struct *leader = NULL;
3120         struct task_struct *task;
3121         int retval = -ENOENT;
3122         ino_t ino;
3123         int tid;
3124         struct pid_namespace *ns;
3125
3126         task = get_proc_task(inode);
3127         if (!task)
3128                 goto out_no_task;
3129         rcu_read_lock();
3130         if (pid_alive(task)) {
3131                 leader = task->group_leader;
3132                 get_task_struct(leader);
3133         }
3134         rcu_read_unlock();
3135         put_task_struct(task);
3136         if (!leader)
3137                 goto out_no_task;
3138         retval = 0;
3139
3140         switch ((unsigned long)filp->f_pos) {
3141         case 0:
3142                 ino = inode->i_ino;
3143                 if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0)
3144                         goto out;
3145                 filp->f_pos++;
3146                 /* fall through */
3147         case 1:
3148                 ino = parent_ino(dentry);
3149                 if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0)
3150                         goto out;
3151                 filp->f_pos++;
3152                 /* fall through */
3153         }
3154
3155         /* f_version caches the tgid value that the last readdir call couldn't
3156          * return. lseek aka telldir automagically resets f_version to 0.
3157          */
3158         ns = filp->f_dentry->d_sb->s_fs_info;
3159         tid = (int)filp->f_version;
3160         filp->f_version = 0;
3161         for (task = first_tid(leader, tid, filp->f_pos - 2, ns);
3162              task;
3163              task = next_tid(task), filp->f_pos++) {
3164                 tid = task_pid_nr_ns(task, ns);
3165                 if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) {
3166                         /* returning this tgid failed, save it as the first
3167                          * pid for the next readir call */
3168                         filp->f_version = (u64)tid;
3169                         put_task_struct(task);
3170                         break;
3171                 }
3172         }
3173 out:
3174         put_task_struct(leader);
3175 out_no_task:
3176         return retval;
3177 }
3178
3179 static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
3180 {
3181         struct inode *inode = dentry->d_inode;
3182         struct task_struct *p = get_proc_task(inode);
3183         generic_fillattr(inode, stat);
3184
3185         if (p) {
3186                 stat->nlink += get_nr_threads(p);
3187                 put_task_struct(p);
3188         }
3189
3190         return 0;
3191 }
3192
3193 static const struct inode_operations proc_task_inode_operations = {
3194         .lookup         = proc_task_lookup,
3195         .getattr        = proc_task_getattr,
3196         .setattr        = proc_setattr,
3197 };
3198
3199 static const struct file_operations proc_task_operations = {
3200         .read           = generic_read_dir,
3201         .readdir        = proc_task_readdir,
3202 };