/* [davej-history.git] kernel/fork.c, as of "Import 2.1.116pre2" */

/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also system_call.s).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/mm.c': 'copy_page_tables()'
 */

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/unistd.h>
#include <linux/ptrace.h>
#include <linux/malloc.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/module.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
#include <asm/uaccess.h>

int nr_tasks=1;
int nr_running=1;
unsigned long int total_forks=0;	/* Handle normal Linux uptimes. */
int last_pid=0;

/* SLAB cache for mm_struct's. */
kmem_cache_t *mm_cachep;

/* SLAB cache for files structs */
kmem_cache_t *files_cachep;

struct task_struct *pidhash[PIDHASH_SZ];

struct task_struct **tarray_freelist = NULL;
spinlock_t taskslot_lock = SPIN_LOCK_UNLOCKED;

/* UID task count cache, to prevent walking entire process list every
 * single fork() operation.
 */
#define UIDHASH_SZ	(PIDHASH_SZ >> 2)

static struct user_struct {
        struct user_struct *next, **pprev;
        unsigned int uid;
        int task_count;
} *uidhash[UIDHASH_SZ];

spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;

kmem_cache_t *uid_cachep;

#define uidhashfn(uid)	(((uid >> 8) ^ uid) & (UIDHASH_SZ - 1))

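/*
 * The uidhash helpers below keep each bucket as a doubly-linked chain
 * using the pprev idiom: pprev points at whichever pointer currently
 * holds the entry (the bucket head or the previous entry's next), so
 * removal never needs to rescan the chain.  All of them run under
 * uidhash_lock.
 */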
static inline void uid_hash_insert(struct user_struct *up, unsigned int hashent)
{
        spin_lock(&uidhash_lock);
        if((up->next = uidhash[hashent]) != NULL)
                uidhash[hashent]->pprev = &up->next;
        up->pprev = &uidhash[hashent];
        uidhash[hashent] = up;
        spin_unlock(&uidhash_lock);
}

static inline void uid_hash_remove(struct user_struct *up)
{
        spin_lock(&uidhash_lock);
        if(up->next)
                up->next->pprev = up->pprev;
        *up->pprev = up->next;
        spin_unlock(&uidhash_lock);
}

static inline struct user_struct *uid_find(unsigned short uid, unsigned int hashent)
{
        struct user_struct *up;

        spin_lock(&uidhash_lock);
        for(up = uidhash[hashent]; (up && up->uid != uid); up = up->next)
                ;
        spin_unlock(&uidhash_lock);
        return up;
}

void free_uid(struct task_struct *p)
{
        struct user_struct *up = p->user;

        if (up) {
                p->user = NULL;
                lock_kernel();
                if (!--up->task_count) {
                        uid_hash_remove(up);
                        kmem_cache_free(uid_cachep, up);
                }
                unlock_kernel();
        }
}

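/*
 * Charge a new task to its user: look the uid up in the hash, allocate
 * a user_struct on first use, and bump its task_count.  free_uid()
 * above is the inverse, dropping the count and freeing the entry when
 * the last task owned by that uid goes away.
 */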
int alloc_uid(struct task_struct *p)
{
        unsigned int hashent = uidhashfn(p->uid);
        struct user_struct *up = uid_find(p->uid, hashent);

        p->user = up;
        if (!up) {
                up = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
                if (!up)
                        return -EAGAIN;
                p->user = up;
                up->uid = p->uid;
                up->task_count = 0;
                uid_hash_insert(up, hashent);
        }

        up->task_count++;
        return 0;
}

__initfunc(void uidcache_init(void))
{
        int i;

        uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
                                       0,
                                       SLAB_HWCACHE_ALIGN, NULL, NULL);
        if(!uid_cachep)
                panic("Cannot create uid taskcount SLAB cache\n");

        for(i = 0; i < UIDHASH_SZ; i++)
                uidhash[i] = 0;
}

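/*
 * Find a free slot in the task array.  Ordinary users stop getting
 * slots once fewer than MIN_TASKS_LEFT_FOR_ROOT remain, so root can
 * still log in and clean up even when the table is nearly full.
 */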
static inline struct task_struct ** find_empty_process(void)
{
        struct task_struct **tslot = NULL;

        if (!current->uid || (nr_tasks < NR_TASKS - MIN_TASKS_LEFT_FOR_ROOT))
                tslot = get_free_taskslot();
        return tslot;
}

/* Protects next_safe and last_pid. */
spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;

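/*
 * Allocate a pid for the new task.  last_pid is bumped sequentially
 * and wraps back to 300 (lower pids are left to long-lived daemons)
 * once it would pass 0x7fff.  next_safe caches the nearest pid, pgrp
 * or session id above last_pid that is already taken, so the tasklist
 * only has to be rescanned when last_pid actually reaches it.
 */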
static int get_pid(unsigned long flags)
{
        static int next_safe = PID_MAX;
        struct task_struct *p;

        if (flags & CLONE_PID)
                return current->pid;

        spin_lock(&lastpid_lock);
        if((++last_pid) & 0xffff8000) {
                last_pid = 300;		/* Skip daemons etc. */
                goto inside;
        }
        if(last_pid >= next_safe) {
inside:
                next_safe = PID_MAX;
                read_lock(&tasklist_lock);
        repeat:
                for_each_task(p) {
                        if(p->pid == last_pid ||
                           p->pgrp == last_pid ||
                           p->session == last_pid) {
                                if(++last_pid >= next_safe) {
                                        if(last_pid & 0xffff8000)
                                                last_pid = 300;
                                        next_safe = PID_MAX;
                                        goto repeat;
                                }
                        }
                        if(p->pid > last_pid && next_safe > p->pid)
                                next_safe = p->pid;
                        if(p->pgrp > last_pid && next_safe > p->pgrp)
                                next_safe = p->pgrp;
                        if(p->session > last_pid && next_safe > p->session)
                                next_safe = p->session;
                }
                read_unlock(&tasklist_lock);
        }
        spin_unlock(&lastpid_lock);

        return last_pid;
}

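/*
 * Duplicate the parent's vma list into the new mm: every vma is
 * copied (minus VM_LOCKED), backing files gain a reference (with
 * i_writecount adjusted for VM_DENYWRITE mappings) and are linked
 * into the file's share list, and copy_page_range() copies the page
 * tables so the underlying pages are shared copy-on-write.
 */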
static inline int dup_mmap(struct mm_struct * mm)
{
        struct vm_area_struct * mpnt, *tmp, **pprev;
        int retval;

        flush_cache_mm(current->mm);
        pprev = &mm->mmap;
        for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
                struct file *file;

                retval = -ENOMEM;
                tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
                if (!tmp)
                        goto fail_nomem;
                *tmp = *mpnt;
                tmp->vm_flags &= ~VM_LOCKED;
                tmp->vm_mm = mm;
                mm->map_count++;
                tmp->vm_next = NULL;
                file = tmp->vm_file;
                if (file) {
                        file->f_count++;
                        if (tmp->vm_flags & VM_DENYWRITE)
                                file->f_dentry->d_inode->i_writecount--;

                        /* insert tmp into the share list, just after mpnt */
                        if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
                                mpnt->vm_next_share->vm_pprev_share =
                                        &tmp->vm_next_share;
                        mpnt->vm_next_share = tmp;
                        tmp->vm_pprev_share = &mpnt->vm_next_share;
                }

                /* Copy the pages, but defer checking for errors */
                retval = copy_page_range(mm, current->mm, tmp);
                if (!retval && tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                /*
                 * Link in the new vma even if an error occurred,
                 * so that exit_mmap() can clean up the mess.
                 */
                if((tmp->vm_next = *pprev) != NULL)
                        (*pprev)->vm_pprev = &tmp->vm_next;
                *pprev = tmp;
                tmp->vm_pprev = pprev;

                pprev = &tmp->vm_next;
                if (retval)
                        goto fail_nomem;
        }
        retval = 0;

fail_nomem:
        flush_tlb_mm(current->mm);
        return retval;
}

/*
 * Allocate and initialize an mm_struct.
 *
 * NOTE! The mm mutex will be locked until the
 * caller decides that all systems are go..
 */
struct mm_struct * mm_alloc(void)
{
        struct mm_struct * mm;

        mm = kmem_cache_alloc(mm_cachep, SLAB_KERNEL);
        if (mm) {
                *mm = *current->mm;
                init_new_context(mm);
                atomic_set(&mm->count, 1);
                mm->map_count = 0;
                mm->def_flags = 0;
                mm->mmap_sem = MUTEX_LOCKED;
                /*
                 * Leave mm->pgd set to the parent's pgd
                 * so that pgd_offset() is always valid.
                 */
                mm->mmap = mm->mmap_cache = NULL;

                /* It has not run yet, so cannot be present in anyone's
                 * cache or tlb.
                 */
                mm->cpu_vm_mask = 0;
        }
        return mm;
}

/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
        if (atomic_dec_and_test(&mm->count)) {
                release_segments(mm);
                exit_mmap(mm);
                free_page_tables(mm);
                kmem_cache_free(mm_cachep, mm);
        }
}

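/*
 * Give the child its mm.  With CLONE_VM the parent's mm is reused and
 * only its use count goes up; otherwise a fresh mm_struct, new page
 * tables and a duplicated vma list are built via mm_alloc(),
 * new_page_tables() and dup_mmap().
 */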
static inline int copy_mm(int nr, unsigned long clone_flags, struct task_struct * tsk)
{
        struct mm_struct * mm;
        int retval;

        if (clone_flags & CLONE_VM) {
                mmget(current->mm);
                /*
                 * Set up the LDT descriptor for the clone task.
                 */
                copy_segments(nr, tsk, NULL);
                SET_PAGE_DIR(tsk, current->mm->pgd);
                return 0;
        }

        retval = -ENOMEM;
        mm = mm_alloc();
        if (!mm)
                goto fail_nomem;

        tsk->mm = mm;
        tsk->min_flt = tsk->maj_flt = 0;
        tsk->cmin_flt = tsk->cmaj_flt = 0;
        tsk->nswap = tsk->cnswap = 0;
        copy_segments(nr, tsk, mm);
        retval = new_page_tables(tsk);
        if (retval)
                goto free_mm;
        retval = dup_mmap(mm);
        if (retval)
                goto free_pt;
        up(&mm->mmap_sem);
        return 0;

free_mm:
        mm->pgd = NULL;
free_pt:
        tsk->mm = NULL;
        mmput(mm);
fail_nomem:
        return retval;
}

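/*
 * fs_struct carries the umask, root and working directory.  With
 * CLONE_FS it is shared by bumping its count; otherwise a private
 * copy is made and root/pwd get their own dentry references.
 */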
static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
        if (clone_flags & CLONE_FS) {
                atomic_inc(&current->fs->count);
                return 0;
        }
        tsk->fs = kmalloc(sizeof(*tsk->fs), GFP_KERNEL);
        if (!tsk->fs)
                return -1;
        atomic_set(&tsk->fs->count, 1);
        tsk->fs->umask = current->fs->umask;
        tsk->fs->root = dget(current->fs->root);
        tsk->fs->pwd = dget(current->fs->pwd);
        return 0;
}

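/*
 * __copy_fdset() copies the open-fd bitmap and returns an index just
 * past the highest word that contains an open descriptor, i.e. an
 * upper bound rounded up to a whole word of bits, hence the accuracy
 * caveat below.
 */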
/* return value is only accurate by +-sizeof(long)*8 fds */
/* XXX make this architecture specific */
static inline int __copy_fdset(unsigned long *d, unsigned long *src)
{
        int i;
        unsigned long *p = src;
        unsigned long *max = src;

        for (i = __FDSET_LONGS; i; --i) {
                if ((*d++ = *p++) != 0)
                        max = p;
        }
        return (max - src)*sizeof(long)*8;
}

static inline int copy_fdset(fd_set *dst, fd_set *src)
{
        return __copy_fdset(dst->fds_bits, src->fds_bits);
}

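/*
 * Duplicate the descriptor table.  With CLONE_FILES the parent's
 * files_struct is shared; otherwise a new files_struct and fd array
 * are allocated, the close-on-exec and open-fds sets are copied, and
 * every open struct file gets its f_count bumped.
 */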
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
        struct files_struct *oldf, *newf;
        struct file **old_fds, **new_fds;
        int size, i, error = 0;

        /*
         * A background process may not have any files ...
         */
        oldf = current->files;
        if (!oldf)
                goto out;

        if (clone_flags & CLONE_FILES) {
                atomic_inc(&oldf->count);
                goto out;
        }

        tsk->files = NULL;
        error = -ENOMEM;
        newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
        if (!newf)
                goto out;

        /*
         * Allocate the fd array, using get_free_page() if possible.
         * Eventually we want to make the array size variable ...
         */
        size = NR_OPEN * sizeof(struct file *);
        if (size == PAGE_SIZE)
                new_fds = (struct file **) __get_free_page(GFP_KERNEL);
        else
                new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
        if (!new_fds)
                goto out_release;
        memset((void *) new_fds, 0, size);

        atomic_set(&newf->count, 1);
        newf->max_fds = NR_OPEN;
        newf->fd = new_fds;
        newf->close_on_exec = oldf->close_on_exec;
        i = copy_fdset(&newf->open_fds, &oldf->open_fds);

        old_fds = oldf->fd;
        for (; i != 0; i--) {
                struct file * f = *old_fds;
                old_fds++;
                *new_fds = f;
                if (f)
                        f->f_count++;
                new_fds++;
        }
        tsk->files = newf;
        error = 0;
out:
        return error;

out_release:
        kmem_cache_free(files_cachep, newf);
        goto out;
}

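/*
 * Signal handlers: share the parent's signal table for CLONE_SIGHAND,
 * otherwise allocate a new one and copy the action array.
 */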
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
        if (clone_flags & CLONE_SIGHAND) {
                atomic_inc(&current->sig->count);
                return 0;
        }
        tsk->sig = kmalloc(sizeof(*tsk->sig), GFP_KERNEL);
        if (!tsk->sig)
                return -1;
        spin_lock_init(&tsk->sig->siglock);
        atomic_set(&tsk->sig->count, 1);
        memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
        return 0;
}

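/*
 * The child starts with the parent's flags, minus any record of
 * having used super-user privileges or exec'd, and it only remains
 * traced if CLONE_PTRACE was asked for.
 */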
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
        unsigned long new_flags = p->flags;

        new_flags &= ~PF_SUPERPRIV;
        new_flags |= PF_FORKNOEXEC;
        if (!(clone_flags & CLONE_PTRACE))
                new_flags &= ~(PF_PTRACED|PF_TRACESYS);
        p->flags = new_flags;
}

/*
 *  Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It
 * also copies the data segment in its entirety.
 */
int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs)
{
        int nr;
        int retval = -ENOMEM;
        struct task_struct *p;

        p = alloc_task_struct();
        if (!p)
                goto fork_out;

        *p = *current;

        down(&current->mm->mmap_sem);
        lock_kernel();

        if (p->user) {
                if (p->user->task_count >= p->rlim[RLIMIT_NPROC].rlim_cur)
                        goto bad_fork_free;
        }

        {
                struct task_struct **tslot;
                tslot = find_empty_process();
                retval = -EAGAIN;
                if (!tslot)
                        goto bad_fork_free;
                p->tarray_ptr = tslot;
                *tslot = p;
                nr = tslot - &task[0];
        }

        if (p->exec_domain && p->exec_domain->module)
                __MOD_INC_USE_COUNT(p->exec_domain->module);
        if (p->binfmt && p->binfmt->module)
                __MOD_INC_USE_COUNT(p->binfmt->module);

        p->did_exec = 0;
        p->swappable = 0;
        p->state = TASK_UNINTERRUPTIBLE;

        copy_flags(clone_flags, p);
        p->pid = get_pid(clone_flags);

        /*
         * This is a "shadow run" state. The process
         * is marked runnable, but isn't actually on
         * any run queue yet.. (that happens at the
         * very end).
         */
        p->state = TASK_RUNNING;
        p->next_run = p;
        p->prev_run = p;

        p->p_pptr = p->p_opptr = current;
        p->p_cptr = NULL;
        init_waitqueue(&p->wait_chldexit);

        p->sigpending = 0;
        sigemptyset(&p->signal);
        p->sigqueue = NULL;
        p->sigqueue_tail = &p->sigqueue;

        p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
        p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
        init_timer(&p->real_timer);
        p->real_timer.data = (unsigned long) p;

        p->leader = 0;		/* session leadership doesn't inherit */
        p->tty_old_pgrp = 0;
        p->times.tms_utime = p->times.tms_stime = 0;
        p->times.tms_cutime = p->times.tms_cstime = 0;
#ifdef __SMP__
        {
                int i;
                p->has_cpu = 0;
                p->processor = NO_PROC_ID;
                /* ?? should we just memset this ?? */
                for(i = 0; i < smp_num_cpus; i++)
                        p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
                spin_lock_init(&p->sigmask_lock);
        }
#endif
        p->lock_depth = -1;		/* -1 = no lock */
        p->start_time = jiffies;

        {
                /* This makes it visible to the rest of the system */
                unsigned long flags;
                write_lock_irqsave(&tasklist_lock, flags);
                SET_LINKS(p);
                hash_pid(p);
                write_unlock_irqrestore(&tasklist_lock, flags);
        }

        nr_tasks++;
        if (p->user)
                p->user->task_count++;

        retval = -ENOMEM;
        /* copy all the process information */
        if (copy_files(clone_flags, p))
                goto bad_fork_cleanup;
        if (copy_fs(clone_flags, p))
                goto bad_fork_cleanup_files;
        if (copy_sighand(clone_flags, p))
                goto bad_fork_cleanup_fs;
        if (copy_mm(nr, clone_flags, p))
                goto bad_fork_cleanup_sighand;
        retval = copy_thread(nr, clone_flags, usp, p, regs);
        if (retval)
                goto bad_fork_cleanup_sighand;
        p->semundo = NULL;

        /* ok, now we should be set up.. */
        p->swappable = 1;
        p->exit_signal = clone_flags & CSIGNAL;
        p->pdeath_signal = 0;

        /*
         * "share" dynamic priority between parent and child, so the
         * total amount of dynamic priority in the system doesn't change,
         * for more scheduling fairness. This is only important in the
         * first timeslice; in the long run the scheduling behaviour is
         * unchanged.
         */
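        /*
         * E.g. a parent with 20 ticks left keeps 10 and the child
         * starts with 10; on an odd count the spare tick is simply
         * lost to the right shift.
         */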
        current->counter >>= 1;
        p->counter = current->counter;

        /* Ok, add it to the run-queues, let it rip! */
        retval = p->pid;
        if (retval) {
                p->next_run = NULL;
                p->prev_run = NULL;
                wake_up_process(p);		/* do this last */
        }
        ++total_forks;
bad_fork:
        up(&current->mm->mmap_sem);
        unlock_kernel();
fork_out:
        return retval;

bad_fork_cleanup_sighand:
        exit_sighand(p);
bad_fork_cleanup_fs:
        exit_fs(p); /* blocking */
bad_fork_cleanup_files:
        exit_files(p); /* blocking */
bad_fork_cleanup:
        if (p->exec_domain && p->exec_domain->module)
                __MOD_DEC_USE_COUNT(p->exec_domain->module);
        if (p->binfmt && p->binfmt->module)
                __MOD_DEC_USE_COUNT(p->binfmt->module);

        {
                unsigned long flags;
                write_lock_irqsave(&tasklist_lock, flags);
                unhash_pid(p);
                REMOVE_LINKS(p);
                write_unlock_irqrestore(&tasklist_lock, flags);
        }

        if (p->user)
                p->user->task_count--;	/* undo the charge taken above */
        nr_tasks--;
        add_free_taskslot(p->tarray_ptr);
bad_fork_free:
        free_task_struct(p);
        goto bad_fork;
}
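
/*
 * files_struct objects come from their own SLAB cache; the constructor
 * below zeroes each object as it is added to the cache.
 */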
static void files_ctor(void *fp, kmem_cache_t *cachep, unsigned long flags)
{
        struct files_struct *f = fp;

        memset(f, 0, sizeof(*f));
}

__initfunc(void filescache_init(void))
{
        files_cachep = kmem_cache_create("files_cache",
                                         sizeof(struct files_struct),
                                         0,
                                         SLAB_HWCACHE_ALIGN,
                                         files_ctor, NULL);
        if (!files_cachep)
                panic("Cannot create files cache");
}