[PATCH] Use Local Percpu Macros for Local Percpu Variables
[linux-2.6/history.git] / kernel / fork.c
blob 2abbc9c2da23050f0af2db9d81b0fe6074e01546
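The patch named above concerns the per-CPU bookkeeping used in this file: process_counts is declared with DEFINE_PER_CPU(), bumped for the local CPU via __get_cpu_var() in copy_process() (under write_lock_irq(&tasklist_lock), so preemption is off there), and summed across CPUs with per_cpu() in nr_processes(). The following is a minimal standalone sketch of that idiom only; the boot_count variable and the two helpers are illustrative and are not part of fork.c below.

/* Illustrative sketch only -- not part of kernel/fork.c below. */
#include <linux/percpu.h>
#include <linux/smp.h>

DEFINE_PER_CPU(unsigned long, boot_count) = 0;	/* hypothetical counter */

static void bump_boot_count(void)
{
	/*
	 * __get_cpu_var() touches the local CPU's copy; the caller must
	 * keep preemption disabled (fork.c relies on holding
	 * tasklist_lock with interrupts off, here we use get_cpu()).
	 */
	get_cpu();
	__get_cpu_var(boot_count)++;
	put_cpu();
}

static unsigned long sum_boot_count(void)
{
	unsigned long total = 0;
	int cpu;

	/* Cross-CPU reads still go through per_cpu(var, cpu). */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		if (cpu_online(cpu))
			total += per_cpu(boot_count, cpu);
	return total;
}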
/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also entry.S and others).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
 */
#include <linux/config.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/completion.h>
#include <linux/namespace.h>
#include <linux/personality.h>
#include <linux/file.h>
#include <linux/binfmts.h>
#include <linux/mman.h>
#include <linux/fs.h>
#include <linux/security.h>
#include <linux/jiffies.h>
#include <linux/futex.h>
#include <linux/ptrace.h>
#include <linux/mount.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk);
extern void exit_sem(struct task_struct *tsk);
/* The idle threads do not count..
 * Protected by write_lock_irq(&tasklist_lock)
 */
int nr_threads;

int max_threads;
unsigned long total_forks;	/* Handle normal Linux uptimes. */

DEFINE_PER_CPU(unsigned long, process_counts) = 0;

rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED;  /* outer */

/*
 * A per-CPU task cache - this relies on the fact that
 * the very last portion of sys_exit() is executed with
 * preemption turned off.
 */
static task_t *task_cache[NR_CPUS] __cacheline_aligned;
int nr_processes(void)
{
	int cpu;
	int total = 0;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu_online(cpu))
			total += per_cpu(process_counts, cpu);
	}
	return total;
}

#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
# define alloc_task_struct()	kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
# define free_task_struct(tsk)	kmem_cache_free(task_struct_cachep, (tsk))
static kmem_cache_t *task_struct_cachep;
#endif
static void free_task(struct task_struct *tsk)
{
	/*
	 * The task cache is effectively disabled right now.
	 * Do we want it? The slab cache already has per-cpu
	 * stuff, but the thread info (usually a order-1 page
	 * allocation) doesn't.
	 */
	if (tsk != current) {
		free_thread_info(tsk->thread_info);
		free_task_struct(tsk);
	} else {
		int cpu = get_cpu();

		tsk = task_cache[cpu];
		if (tsk) {
			free_thread_info(tsk->thread_info);
			free_task_struct(tsk);
		}
		task_cache[cpu] = current;
		put_cpu();
	}
}
void __put_task_struct(struct task_struct *tsk)
{
	WARN_ON(!(tsk->state & (TASK_DEAD | TASK_ZOMBIE)));
	WARN_ON(atomic_read(&tsk->usage));
	WARN_ON(tsk == current);

	security_task_free(tsk);
	free_uid(tsk->user);
	free_task(tsk);
}
void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
{
	unsigned long flags;

	wait->flags &= ~WQ_FLAG_EXCLUSIVE;
	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue(q, wait);
	spin_unlock_irqrestore(&q->lock, flags);
}

void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)
{
	unsigned long flags;

	wait->flags |= WQ_FLAG_EXCLUSIVE;
	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue_tail(q, wait);
	spin_unlock_irqrestore(&q->lock, flags);
}

void remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)
{
	unsigned long flags;

	spin_lock_irqsave(&q->lock, flags);
	__remove_wait_queue(q, wait);
	spin_unlock_irqrestore(&q->lock, flags);
}

void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
	unsigned long flags;

	__set_current_state(state);
	wait->flags &= ~WQ_FLAG_EXCLUSIVE;
	spin_lock_irqsave(&q->lock, flags);
	if (list_empty(&wait->task_list))
		__add_wait_queue(q, wait);
	spin_unlock_irqrestore(&q->lock, flags);
}

void
prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
{
	unsigned long flags;

	__set_current_state(state);
	wait->flags |= WQ_FLAG_EXCLUSIVE;
	spin_lock_irqsave(&q->lock, flags);
	if (list_empty(&wait->task_list))
		__add_wait_queue_tail(q, wait);
	spin_unlock_irqrestore(&q->lock, flags);
}

void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
{
	unsigned long flags;

	__set_current_state(TASK_RUNNING);
	if (!list_empty(&wait->task_list)) {
		spin_lock_irqsave(&q->lock, flags);
		list_del_init(&wait->task_list);
		spin_unlock_irqrestore(&q->lock, flags);
	}
}

int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync)
{
	int ret = default_wake_function(wait, mode, sync);

	if (ret)
		list_del_init(&wait->task_list);
	return ret;
}
void __init fork_init(unsigned long mempages)
{
#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
	/* create a slab on which task_structs can be allocated */
	task_struct_cachep =
		kmem_cache_create("task_struct",
				  sizeof(struct task_struct),0,
				  SLAB_MUST_HWCACHE_ALIGN, NULL, NULL);
	if (!task_struct_cachep)
		panic("fork_init(): cannot create task_struct SLAB cache");
#endif

	/*
	 * The default maximum number of threads is set to a safe
	 * value: the thread structures can take up at most half
	 * of memory.
	 */
	max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8;
	/*
	 * we need to allow at least 20 threads to boot a system
	 */
	if(max_threads < 20)
		max_threads = 20;

	init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
	init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
}
static struct task_struct *dup_task_struct(struct task_struct *orig)
{
	struct task_struct *tsk;
	struct thread_info *ti;
	int cpu = get_cpu();

	prepare_to_copy(orig);

	tsk = task_cache[cpu];
	task_cache[cpu] = NULL;
	put_cpu();
	if (!tsk) {
		tsk = alloc_task_struct();
		if (!tsk)
			return NULL;

		ti = alloc_thread_info(tsk);
		if (!ti) {
			free_task_struct(tsk);
			return NULL;
		}
	} else
		ti = tsk->thread_info;

	*ti = *orig->thread_info;
	*tsk = *orig;
	tsk->thread_info = ti;
	ti->task = tsk;

	/* One for us, one for whoever does the "release_task()" (usually parent) */
	atomic_set(&tsk->usage,2);
	return tsk;
}
#ifdef CONFIG_MMU
static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
{
	struct vm_area_struct * mpnt, *tmp, **pprev;
	int retval;
	unsigned long charge = 0;

	down_write(&oldmm->mmap_sem);
	flush_cache_mm(current->mm);
	mm->locked_vm = 0;
	mm->mmap = NULL;
	mm->mmap_cache = NULL;
	mm->free_area_cache = TASK_UNMAPPED_BASE;
	mm->map_count = 0;
	mm->rss = 0;
	mm->cpu_vm_mask = 0;
	pprev = &mm->mmap;

	/*
	 * Add it to the mmlist after the parent.
	 * Doing it this way means that we can order the list,
	 * and fork() won't mess up the ordering significantly.
	 * Add it first so that swapoff can see any swap entries.
	 */
	spin_lock(&mmlist_lock);
	list_add(&mm->mmlist, &current->mm->mmlist);
	mmlist_nr++;
	spin_unlock(&mmlist_lock);

	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
		struct file *file;

		if (mpnt->vm_flags & VM_DONTCOPY)
			continue;
		if (mpnt->vm_flags & VM_ACCOUNT) {
			unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
			if (!vm_enough_memory(len))
				goto fail_nomem;
			charge += len;
		}
		tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
		if (!tmp)
			goto fail_nomem;
		*tmp = *mpnt;
		tmp->vm_flags &= ~VM_LOCKED;
		tmp->vm_mm = mm;
		tmp->vm_next = NULL;
		file = tmp->vm_file;
		INIT_LIST_HEAD(&tmp->shared);
		if (file) {
			struct inode *inode = file->f_dentry->d_inode;
			get_file(file);
			if (tmp->vm_flags & VM_DENYWRITE)
				atomic_dec(&inode->i_writecount);

			/* insert tmp into the share list, just after mpnt */
			down(&inode->i_mapping->i_shared_sem);
			list_add_tail(&tmp->shared, &mpnt->shared);
			up(&inode->i_mapping->i_shared_sem);
		}

		/*
		 * Link in the new vma and copy the page table entries:
		 * link in first so that swapoff can see swap entries.
		 */
		spin_lock(&mm->page_table_lock);
		*pprev = tmp;
		pprev = &tmp->vm_next;
		mm->map_count++;
		retval = copy_page_range(mm, current->mm, tmp);
		spin_unlock(&mm->page_table_lock);

		if (tmp->vm_ops && tmp->vm_ops->open)
			tmp->vm_ops->open(tmp);

		if (retval)
			goto fail;
	}
	retval = 0;
	build_mmap_rb(mm);

out:
	flush_tlb_mm(current->mm);
	up_write(&oldmm->mmap_sem);
	return retval;
fail_nomem:
	retval = -ENOMEM;
fail:
	vm_unacct_memory(charge);
	goto out;
}
static inline int mm_alloc_pgd(struct mm_struct * mm)
{
	mm->pgd = pgd_alloc(mm);
	if (unlikely(!mm->pgd))
		return -ENOMEM;
	return 0;
}

static inline void mm_free_pgd(struct mm_struct * mm)
{
	pgd_free(mm->pgd);
}

#else
#define dup_mmap(mm, oldmm)	(0)
#define mm_alloc_pgd(mm)	(0)
#define mm_free_pgd(mm)
#endif /* CONFIG_MMU */

spinlock_t mmlist_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
int mmlist_nr;

#define allocate_mm()	(kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))

#include <linux/init_task.h>
static struct mm_struct * mm_init(struct mm_struct * mm)
{
	atomic_set(&mm->mm_users, 1);
	atomic_set(&mm->mm_count, 1);
	init_rwsem(&mm->mmap_sem);
	mm->core_waiters = 0;
	mm->page_table_lock = SPIN_LOCK_UNLOCKED;
	mm->ioctx_list_lock = RW_LOCK_UNLOCKED;
	mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
	mm->free_area_cache = TASK_UNMAPPED_BASE;

	if (likely(!mm_alloc_pgd(mm))) {
		mm->def_flags = 0;
		return mm;
	}
	free_mm(mm);
	return NULL;
}

/*
 * Allocate and initialize an mm_struct.
 */
struct mm_struct * mm_alloc(void)
{
	struct mm_struct * mm;

	mm = allocate_mm();
	if (mm) {
		memset(mm, 0, sizeof(*mm));
		return mm_init(mm);
	}
	return NULL;
}
/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 */
inline void __mmdrop(struct mm_struct *mm)
{
	BUG_ON(mm == &init_mm);
	mm_free_pgd(mm);
	destroy_context(mm);
	free_mm(mm);
}

/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
	if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) {
		list_del(&mm->mmlist);
		mmlist_nr--;
		spin_unlock(&mmlist_lock);
		exit_aio(mm);
		exit_mmap(mm);
		mmdrop(mm);
	}
}
/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * error success whatever.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one. Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
	struct completion *vfork_done = tsk->vfork_done;

	/* Get rid of any cached register state */
	deactivate_mm(tsk, mm);

	/* notify parent sleeping on vfork() */
	if (vfork_done) {
		tsk->vfork_done = NULL;
		complete(vfork_done);
	}
	if (tsk->clear_child_tid && atomic_read(&mm->mm_users) > 1) {
		u32 __user * tidptr = tsk->clear_child_tid;
		tsk->clear_child_tid = NULL;

		/*
		 * We don't check the error code - if userspace has
		 * not set up a proper pointer then tough luck.
		 */
		put_user(0, tidptr);
		sys_futex(tidptr, FUTEX_WAKE, 1, NULL, NULL);
	}
}
static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
	struct mm_struct * mm, *oldmm;
	int retval;

	tsk->min_flt = tsk->maj_flt = 0;
	tsk->cmin_flt = tsk->cmaj_flt = 0;
	tsk->nswap = tsk->cnswap = 0;

	tsk->mm = NULL;
	tsk->active_mm = NULL;

	/*
	 * Are we cloning a kernel thread?
	 *
	 * We need to steal a active VM for that..
	 */
	oldmm = current->mm;
	if (!oldmm)
		return 0;

	if (clone_flags & CLONE_VM) {
		atomic_inc(&oldmm->mm_users);
		mm = oldmm;
		/*
		 * There are cases where the PTL is held to ensure no
		 * new threads start up in user mode using an mm, which
		 * allows optimizing out ipis; the tlb_gather_mmu code
		 * is an example.
		 */
		spin_unlock_wait(&oldmm->page_table_lock);
		goto good_mm;
	}

	retval = -ENOMEM;
	mm = allocate_mm();
	if (!mm)
		goto fail_nomem;

	/* Copy the current MM stuff.. */
	memcpy(mm, oldmm, sizeof(*mm));
	if (!mm_init(mm))
		goto fail_nomem;

	if (init_new_context(tsk,mm))
		goto free_pt;

	retval = dup_mmap(mm, oldmm);
	if (retval)
		goto free_pt;

good_mm:
	tsk->mm = mm;
	tsk->active_mm = mm;
	return 0;

free_pt:
	mmput(mm);
fail_nomem:
	return retval;
}
static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old)
{
	struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
	/* We don't need to lock fs - think why ;-) */
	if (fs) {
		atomic_set(&fs->count, 1);
		fs->lock = RW_LOCK_UNLOCKED;
		fs->umask = old->umask;
		read_lock(&old->lock);
		fs->rootmnt = mntget(old->rootmnt);
		fs->root = dget(old->root);
		fs->pwdmnt = mntget(old->pwdmnt);
		fs->pwd = dget(old->pwd);
		if (old->altroot) {
			fs->altrootmnt = mntget(old->altrootmnt);
			fs->altroot = dget(old->altroot);
		} else {
			fs->altrootmnt = NULL;
			fs->altroot = NULL;
		}
		read_unlock(&old->lock);
	}
	return fs;
}

struct fs_struct *copy_fs_struct(struct fs_struct *old)
{
	return __copy_fs_struct(old);
}

static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
	if (clone_flags & CLONE_FS) {
		atomic_inc(&current->fs->count);
		return 0;
	}
	tsk->fs = __copy_fs_struct(current->fs);
	if (!tsk->fs)
		return -ENOMEM;
	return 0;
}
static int count_open_files(struct files_struct *files, int size)
{
	int i;

	/* Find the last open fd */
	for (i = size/(8*sizeof(long)); i > 0; ) {
		if (files->open_fds->fds_bits[--i])
			break;
	}
	i = (i+1) * 8 * sizeof(long);
	return i;
}
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
	struct files_struct *oldf, *newf;
	struct file **old_fds, **new_fds;
	int open_files, nfds, size, i, error = 0;

	/*
	 * A background process may not have any files ...
	 */
	oldf = current->files;
	if (!oldf)
		goto out;

	if (clone_flags & CLONE_FILES) {
		atomic_inc(&oldf->count);
		goto out;
	}

	tsk->files = NULL;
	error = -ENOMEM;
	newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
	if (!newf)
		goto out;

	atomic_set(&newf->count, 1);

	newf->file_lock	    = SPIN_LOCK_UNLOCKED;
	newf->next_fd	    = 0;
	newf->max_fds	    = NR_OPEN_DEFAULT;
	newf->max_fdset	    = __FD_SETSIZE;
	newf->close_on_exec = &newf->close_on_exec_init;
	newf->open_fds	    = &newf->open_fds_init;
	newf->fd	    = &newf->fd_array[0];

	/* We don't yet have the oldf readlock, but even if the old
	   fdset gets grown now, we'll only copy up to "size" fds */
	size = oldf->max_fdset;
	if (size > __FD_SETSIZE) {
		newf->max_fdset = 0;
		spin_lock(&newf->file_lock);
		error = expand_fdset(newf, size-1);
		spin_unlock(&newf->file_lock);
		if (error)
			goto out_release;
	}
	spin_lock(&oldf->file_lock);

	open_files = count_open_files(oldf, size);

	/*
	 * Check whether we need to allocate a larger fd array.
	 * Note: we're not a clone task, so the open count won't
	 * change.
	 */
	nfds = NR_OPEN_DEFAULT;
	if (open_files > nfds) {
		spin_unlock(&oldf->file_lock);
		newf->max_fds = 0;
		spin_lock(&newf->file_lock);
		error = expand_fd_array(newf, open_files-1);
		spin_unlock(&newf->file_lock);
		if (error)
			goto out_release;
		nfds = newf->max_fds;
		spin_lock(&oldf->file_lock);
	}

	old_fds = oldf->fd;
	new_fds = newf->fd;

	memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
	memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);

	for (i = open_files; i != 0; i--) {
		struct file *f = *old_fds++;
		if (f)
			get_file(f);
		*new_fds++ = f;
	}
	spin_unlock(&oldf->file_lock);

	/* compute the remainder to be cleared */
	size = (newf->max_fds - open_files) * sizeof(struct file *);

	/* This is long word aligned thus could use a optimized version */
	memset(new_fds, 0, size);

	if (newf->max_fdset > open_files) {
		int left = (newf->max_fdset-open_files)/8;
		int start = open_files / (8 * sizeof(unsigned long));

		memset(&newf->open_fds->fds_bits[start], 0, left);
		memset(&newf->close_on_exec->fds_bits[start], 0, left);
	}

	tsk->files = newf;
	error = 0;
out:
	return error;

out_release:
	free_fdset (newf->close_on_exec, newf->max_fdset);
	free_fdset (newf->open_fds, newf->max_fdset);
	kmem_cache_free(files_cachep, newf);
	goto out;
}
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
	struct sighand_struct *sig;

	if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) {
		atomic_inc(&current->sighand->count);
		return 0;
	}
	sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
	tsk->sighand = sig;
	if (!sig)
		return -ENOMEM;
	spin_lock_init(&sig->siglock);
	atomic_set(&sig->count, 1);
	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
	return 0;
}
static inline int copy_signal(unsigned long clone_flags, struct task_struct * tsk)
{
	struct signal_struct *sig;

	if (clone_flags & CLONE_THREAD) {
		atomic_inc(&current->signal->count);
		return 0;
	}
	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
	tsk->signal = sig;
	if (!sig)
		return -ENOMEM;
	atomic_set(&sig->count, 1);
	sig->group_exit = 0;
	sig->group_exit_code = 0;
	sig->group_exit_task = NULL;
	sig->group_stop_count = 0;
	sig->curr_target = NULL;
	init_sigpending(&sig->shared_pending);

	return 0;
}
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
	unsigned long new_flags = p->flags;

	new_flags &= ~PF_SUPERPRIV;
	new_flags |= PF_FORKNOEXEC;
	if (!(clone_flags & CLONE_PTRACE))
		p->ptrace = 0;
	p->flags = new_flags;
}

asmlinkage long sys_set_tid_address(int __user *tidptr)
{
	current->clear_child_tid = tidptr;

	return current->pid;
}
/*
 * This creates a new process as a copy of the old one,
 * but does not actually start it yet.
 *
 * It copies the registers, and all the appropriate
 * parts of the process environment (as per the clone
 * flags). The actual kick-off is left to the caller.
 */
struct task_struct *copy_process(unsigned long clone_flags,
				 unsigned long stack_start,
				 struct pt_regs *regs,
				 unsigned long stack_size,
				 int __user *parent_tidptr,
				 int __user *child_tidptr)
{
	int retval;
	struct task_struct *p = NULL;

	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
		return ERR_PTR(-EINVAL);

	/*
	 * Thread groups must share signals as well, and detached threads
	 * can only be started up within the thread group.
	 */
	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
		return ERR_PTR(-EINVAL);
	if ((clone_flags & CLONE_DETACHED) && !(clone_flags & CLONE_THREAD))
		return ERR_PTR(-EINVAL);

	retval = security_task_create(clone_flags);
	if (retval)
		goto fork_out;

	retval = -ENOMEM;
	p = dup_task_struct(current);
	if (!p)
		goto fork_out;
	retval = -EAGAIN;
	if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur) {
		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE))
			goto bad_fork_free;
	}

	atomic_inc(&p->user->__count);
	atomic_inc(&p->user->processes);

	/*
	 * If multiple threads are within copy_process(), then this check
	 * triggers too late. This doesn't hurt, the check is only there
	 * to stop root fork bombs.
	 */
	if (nr_threads >= max_threads)
		goto bad_fork_cleanup_count;

	if (!try_module_get(p->thread_info->exec_domain->module))
		goto bad_fork_cleanup_count;

	if (p->binfmt && !try_module_get(p->binfmt->module))
		goto bad_fork_cleanup_put_domain;

#ifdef CONFIG_PREEMPT
	/*
	 * schedule_tail drops this_rq()->lock so we compensate with a count
	 * of 1. Also, we want to start with kernel preemption disabled.
	 */
	p->thread_info->preempt_count = 1;
#endif
	p->did_exec = 0;
	p->state = TASK_UNINTERRUPTIBLE;

	copy_flags(clone_flags, p);
	if (clone_flags & CLONE_IDLETASK)
		p->pid = 0;
	else {
		p->pid = alloc_pidmap();
		if (p->pid == -1)
			goto bad_fork_cleanup;
	}
	retval = -EFAULT;
	if (clone_flags & CLONE_PARENT_SETTID)
		if (put_user(p->pid, parent_tidptr))
			goto bad_fork_cleanup;

	p->proc_dentry = NULL;

	INIT_LIST_HEAD(&p->run_list);

	INIT_LIST_HEAD(&p->children);
	INIT_LIST_HEAD(&p->sibling);
	INIT_LIST_HEAD(&p->posix_timers);
	init_waitqueue_head(&p->wait_chldexit);
	p->vfork_done = NULL;
	spin_lock_init(&p->alloc_lock);
	spin_lock_init(&p->switch_lock);
	spin_lock_init(&p->proc_lock);

	clear_tsk_thread_flag(p, TIF_SIGPENDING);
	init_sigpending(&p->pending);

	p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
	p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
	init_timer(&p->real_timer);
	p->real_timer.data = (unsigned long) p;

	p->leader = 0;		/* session leadership doesn't inherit */
	p->tty_old_pgrp = 0;
	p->utime = p->stime = 0;
	p->cutime = p->cstime = 0;
	p->array = NULL;
	p->lock_depth = -1;	/* -1 = no lock */
	p->start_time = get_jiffies_64();
	p->security = NULL;
	retval = -ENOMEM;
	if ((retval = security_task_alloc(p)))
		goto bad_fork_cleanup;
	/* copy all the process information */
	if ((retval = copy_semundo(clone_flags, p)))
		goto bad_fork_cleanup_security;
	if ((retval = copy_files(clone_flags, p)))
		goto bad_fork_cleanup_semundo;
	if ((retval = copy_fs(clone_flags, p)))
		goto bad_fork_cleanup_files;
	if ((retval = copy_sighand(clone_flags, p)))
		goto bad_fork_cleanup_fs;
	if ((retval = copy_signal(clone_flags, p)))
		goto bad_fork_cleanup_sighand;
	if ((retval = copy_mm(clone_flags, p)))
		goto bad_fork_cleanup_signal;
	if ((retval = copy_namespace(clone_flags, p)))
		goto bad_fork_cleanup_mm;
	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
	if (retval)
		goto bad_fork_cleanup_namespace;

	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
	/*
	 * Clear TID on mm_release()?
	 */
	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;

	/*
	 * Syscall tracing should be turned off in the child regardless
	 * of CLONE_PTRACE.
	 */
	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);

	/* Our parent execution domain becomes current domain
	   These must match for thread signalling to apply */

	p->parent_exec_id = p->self_exec_id;

	/* ok, now we should be set up.. */
	if (clone_flags & CLONE_DETACHED)
		p->exit_signal = -1;
	else
		p->exit_signal = clone_flags & CSIGNAL;
	p->pdeath_signal = 0;

	/*
	 * Share the timeslice between parent and child, thus the
	 * total amount of pending timeslices in the system doesn't change,
	 * resulting in more scheduling fairness.
	 */
	local_irq_disable();
	p->time_slice = (current->time_slice + 1) >> 1;
	/*
	 * The remainder of the first timeslice might be recovered by
	 * the parent if the child exits early enough.
	 */
	p->first_time_slice = 1;
	current->time_slice >>= 1;
	p->last_run = jiffies;
	if (!current->time_slice) {
		/*
		 * This case is rare, it happens when the parent has only
		 * a single jiffy left from its timeslice. Taking the
		 * runqueue lock is not a problem.
		 */
		current->time_slice = 1;
		preempt_disable();
		scheduler_tick(0, 0);
		local_irq_enable();
		preempt_enable();
	} else
		local_irq_enable();
	/*
	 * Ok, add it to the run-queues and make it
	 * visible to the rest of the system.
	 *
	 * Let it rip!
	 */
	p->tgid = p->pid;
	p->group_leader = p;
	INIT_LIST_HEAD(&p->ptrace_children);
	INIT_LIST_HEAD(&p->ptrace_list);

	/* Need tasklist lock for parent etc handling! */
	write_lock_irq(&tasklist_lock);
	/*
	 * Check for pending SIGKILL! The new thread should not be allowed
	 * to slip out of an OOM kill. (or normal SIGKILL.)
	 */
	if (sigismember(&current->pending.signal, SIGKILL)) {
		write_unlock_irq(&tasklist_lock);
		retval = -EINTR;
		goto bad_fork_cleanup_namespace;
	}

	/* CLONE_PARENT re-uses the old parent */
	if (clone_flags & CLONE_PARENT)
		p->real_parent = current->real_parent;
	else
		p->real_parent = current;
	p->parent = p->real_parent;

	if (clone_flags & CLONE_THREAD) {
		spin_lock(&current->sighand->siglock);
		/*
		 * Important: if an exit-all has been started then
		 * do not create this new thread - the whole thread
		 * group is supposed to exit anyway.
		 */
		if (current->signal->group_exit) {
			spin_unlock(&current->sighand->siglock);
			write_unlock_irq(&tasklist_lock);
			goto bad_fork_cleanup_namespace;
		}
		p->tgid = current->tgid;
		p->group_leader = current->group_leader;

		if (current->signal->group_stop_count > 0) {
			/*
			 * There is an all-stop in progress for the group.
			 * We ourselves will stop as soon as we check signals.
			 * Make the new thread part of that group stop too.
			 */
			current->signal->group_stop_count++;
			set_tsk_thread_flag(p, TIF_SIGPENDING);
		}

		spin_unlock(&current->sighand->siglock);
	}

	SET_LINKS(p);
	if (p->ptrace & PT_PTRACED)
		__ptrace_link(p, current->parent);

	attach_pid(p, PIDTYPE_PID, p->pid);
	if (thread_group_leader(p)) {
		attach_pid(p, PIDTYPE_TGID, p->tgid);
		attach_pid(p, PIDTYPE_PGID, p->pgrp);
		attach_pid(p, PIDTYPE_SID, p->session);
		if (p->pid)
			__get_cpu_var(process_counts)++;
	} else
		link_pid(p, p->pids + PIDTYPE_TGID, &p->group_leader->pids[PIDTYPE_TGID].pid);

	nr_threads++;
	write_unlock_irq(&tasklist_lock);
	retval = 0;

fork_out:
	if (retval)
		return ERR_PTR(retval);
	return p;
bad_fork_cleanup_namespace:
	exit_namespace(p);
bad_fork_cleanup_mm:
	exit_mm(p);
bad_fork_cleanup_signal:
	exit_signal(p);
bad_fork_cleanup_sighand:
	exit_sighand(p);
bad_fork_cleanup_fs:
	exit_fs(p); /* blocking */
bad_fork_cleanup_files:
	exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
	exit_sem(p);
bad_fork_cleanup_security:
	security_task_free(p);
bad_fork_cleanup:
	if (p->pid > 0)
		free_pidmap(p->pid);
	if (p->binfmt)
		module_put(p->binfmt->module);
bad_fork_cleanup_put_domain:
	module_put(p->thread_info->exec_domain->module);
bad_fork_cleanup_count:
	atomic_dec(&p->user->processes);
	free_uid(p->user);
bad_fork_free:
	free_task(p);
	goto fork_out;
}
static inline int fork_traceflag (unsigned clone_flags)
{
	if (clone_flags & (CLONE_UNTRACED | CLONE_IDLETASK))
		return 0;
	else if (clone_flags & CLONE_VFORK) {
		if (current->ptrace & PT_TRACE_VFORK)
			return PTRACE_EVENT_VFORK;
	} else if ((clone_flags & CSIGNAL) != SIGCHLD) {
		if (current->ptrace & PT_TRACE_CLONE)
			return PTRACE_EVENT_CLONE;
	} else if (current->ptrace & PT_TRACE_FORK)
		return PTRACE_EVENT_FORK;

	return 0;
}
/*
 * Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 */
long do_fork(unsigned long clone_flags,
	      unsigned long stack_start,
	      struct pt_regs *regs,
	      unsigned long stack_size,
	      int __user *parent_tidptr,
	      int __user *child_tidptr)
{
	struct task_struct *p;
	int trace = 0;
	long pid;

	if (unlikely(current->ptrace)) {
		trace = fork_traceflag (clone_flags);
		if (trace)
			clone_flags |= CLONE_PTRACE;
	}

	p = copy_process(clone_flags, stack_start, regs, stack_size, parent_tidptr, child_tidptr);
	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	pid = IS_ERR(p) ? PTR_ERR(p) : p->pid;

	if (!IS_ERR(p)) {
		struct completion vfork;

		if (clone_flags & CLONE_VFORK) {
			p->vfork_done = &vfork;
			init_completion(&vfork);
		}

		if (p->ptrace & PT_PTRACED) {
			/*
			 * We'll start up with an immediate SIGSTOP.
			 */
			sigaddset(&p->pending.signal, SIGSTOP);
			set_tsk_thread_flag(p, TIF_SIGPENDING);
		}

		wake_up_forked_process(p);	/* do this last */
		++total_forks;

		if (unlikely (trace)) {
			current->ptrace_message = pid;
			ptrace_notify ((trace << 8) | SIGTRAP);
		}

		if (clone_flags & CLONE_VFORK) {
			wait_for_completion(&vfork);
			if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
				ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
		} else
			/*
			 * Let the child process run first, to avoid most of the
			 * COW overhead when the child exec()s afterwards.
			 */
			set_need_resched();
	}
	return pid;
}
/* SLAB cache for signal_struct structures (tsk->signal) */
kmem_cache_t *signal_cachep;

/* SLAB cache for sighand_struct structures (tsk->sighand) */
kmem_cache_t *sighand_cachep;

/* SLAB cache for files_struct structures (tsk->files) */
kmem_cache_t *files_cachep;

/* SLAB cache for fs_struct structures (tsk->fs) */
kmem_cache_t *fs_cachep;

/* SLAB cache for vm_area_struct structures */
kmem_cache_t *vm_area_cachep;

/* SLAB cache for mm_struct structures (tsk->mm) */
kmem_cache_t *mm_cachep;
void __init proc_caches_init(void)
{
	sighand_cachep = kmem_cache_create("sighand_cache",
			sizeof(struct sighand_struct), 0,
			SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!sighand_cachep)
		panic("Cannot create sighand SLAB cache");

	signal_cachep = kmem_cache_create("signal_cache",
			sizeof(struct signal_struct), 0,
			SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!signal_cachep)
		panic("Cannot create signal SLAB cache");

	files_cachep = kmem_cache_create("files_cache",
			sizeof(struct files_struct), 0,
			SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!files_cachep)
		panic("Cannot create files SLAB cache");

	fs_cachep = kmem_cache_create("fs_cache",
			sizeof(struct fs_struct), 0,
			SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!fs_cachep)
		panic("Cannot create fs_struct SLAB cache");

	vm_area_cachep = kmem_cache_create("vm_area_struct",
			sizeof(struct vm_area_struct), 0,
			0, NULL, NULL);
	if (!vm_area_cachep)
		panic("vma_init: Cannot alloc vm_area_struct SLAB cache");

	mm_cachep = kmem_cache_create("mm_struct",
			sizeof(struct mm_struct), 0,
			SLAB_HWCACHE_ALIGN, NULL, NULL);
	if (!mm_cachep)
		panic("vma_init: Cannot alloc mm_struct SLAB cache");
}