Import 2.3.4pre2
[davej-history.git] / kernel / fork.c
blobe85429ba4f51250d3b3770540fd24ac3e21ab570
1 /*
2 * linux/kernel/fork.c
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also system_call.s).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/mm.c': 'copy_page_tables()'
 */
14 #include <linux/malloc.h>
15 #include <linux/init.h>
16 #include <linux/unistd.h>
17 #include <linux/smp_lock.h>
18 #include <linux/module.h>
19 #include <linux/vmalloc.h>
21 #include <asm/pgtable.h>
22 #include <asm/mmu_context.h>
23 #include <asm/uaccess.h>
25 /* The idle tasks do not count.. */
26 int nr_tasks=0;
27 int nr_running=0;
29 unsigned long int total_forks=0; /* Handle normal Linux uptimes. */
30 int last_pid=0;
32 /* SLAB cache for mm_struct's. */
33 kmem_cache_t *mm_cachep;
35 /* SLAB cache for files structs */
36 kmem_cache_t *files_cachep;
38 struct task_struct *pidhash[PIDHASH_SZ];
40 struct task_struct **tarray_freelist = NULL;
41 spinlock_t taskslot_lock = SPIN_LOCK_UNLOCKED;
43 /* UID task count cache, to prevent walking entire process list every
44 * single fork() operation.
46 #define UIDHASH_SZ (PIDHASH_SZ >> 2)
48 static struct user_struct {
49 atomic_t count;
50 struct user_struct *next, **pprev;
51 unsigned int uid;
52 } *uidhash[UIDHASH_SZ];
54 spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;
56 kmem_cache_t *uid_cachep;
58 #define uidhashfn(uid) (((uid >> 8) ^ uid) & (UIDHASH_SZ - 1))
61 * These routines must be called with the uidhash spinlock held!
63 static inline void uid_hash_insert(struct user_struct *up, unsigned int hashent)
65 if((up->next = uidhash[hashent]) != NULL)
66 uidhash[hashent]->pprev = &up->next;
67 up->pprev = &uidhash[hashent];
68 uidhash[hashent] = up;
71 static inline void uid_hash_remove(struct user_struct *up)
73 if(up->next)
74 up->next->pprev = up->pprev;
75 *up->pprev = up->next;
78 static inline struct user_struct *uid_hash_find(unsigned short uid, unsigned int hashent)
80 struct user_struct *up, *next;
82 next = uidhash[hashent];
83 for (;;) {
84 up = next;
85 if (next) {
86 next = up->next;
87 if (up->uid != uid)
88 continue;
89 atomic_inc(&up->count);
91 break;
93 return up;
97 * For SMP, we need to re-test the user struct counter
98 * after having aquired the spinlock. This allows us to do
99 * the common case (not freeing anything) without having
100 * any locking.
102 #ifdef __SMP__
103 #define uid_hash_free(up) (!atomic_read(&(up)->count))
104 #else
105 #define uid_hash_free(up) (1)
106 #endif
108 void free_uid(struct task_struct *p)
110 struct user_struct *up = p->user;
112 if (up) {
113 p->user = NULL;
114 if (atomic_dec_and_test(&up->count)) {
115 spin_lock(&uidhash_lock);
116 if (uid_hash_free(up)) {
117 uid_hash_remove(up);
118 kmem_cache_free(uid_cachep, up);
120 spin_unlock(&uidhash_lock);
125 int alloc_uid(struct task_struct *p)
127 unsigned int hashent = uidhashfn(p->uid);
128 struct user_struct *up;
130 spin_lock(&uidhash_lock);
131 up = uid_hash_find(p->uid, hashent);
132 spin_unlock(&uidhash_lock);
134 if (!up) {
135 struct user_struct *new;
137 new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
138 if (!new)
139 return -EAGAIN;
140 new->uid = p->uid;
141 atomic_set(&new->count, 1);
144 * Before adding this, check whether we raced
145 * on adding the same user already..
147 spin_lock(&uidhash_lock);
148 up = uid_hash_find(p->uid, hashent);
149 if (up) {
150 kmem_cache_free(uid_cachep, new);
151 } else {
152 uid_hash_insert(new, hashent);
153 up = new;
155 spin_unlock(&uidhash_lock);
158 p->user = up;
159 return 0;
162 void __init uidcache_init(void)
164 int i;
166 uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
168 SLAB_HWCACHE_ALIGN, NULL, NULL);
169 if(!uid_cachep)
170 panic("Cannot create uid taskcount SLAB cache\n");
172 for(i = 0; i < UIDHASH_SZ; i++)
173 uidhash[i] = 0;
176 static inline struct task_struct ** find_empty_process(void)
178 struct task_struct **tslot = NULL;
180 if ((nr_tasks < NR_TASKS - MIN_TASKS_LEFT_FOR_ROOT) || !current->uid)
181 tslot = get_free_taskslot();
182 return tslot;
185 /* Protects next_safe and last_pid. */
186 spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
188 static int get_pid(unsigned long flags)
190 static int next_safe = PID_MAX;
191 struct task_struct *p;
193 if (flags & CLONE_PID)
194 return current->pid;
196 spin_lock(&lastpid_lock);
197 if((++last_pid) & 0xffff8000) {
198 last_pid = 300; /* Skip daemons etc. */
199 goto inside;
201 if(last_pid >= next_safe) {
202 inside:
203 next_safe = PID_MAX;
204 read_lock(&tasklist_lock);
205 repeat:
206 for_each_task(p) {
207 if(p->pid == last_pid ||
208 p->pgrp == last_pid ||
209 p->session == last_pid) {
210 if(++last_pid >= next_safe) {
211 if(last_pid & 0xffff8000)
212 last_pid = 300;
213 next_safe = PID_MAX;
215 goto repeat;
217 if(p->pid > last_pid && next_safe > p->pid)
218 next_safe = p->pid;
219 if(p->pgrp > last_pid && next_safe > p->pgrp)
220 next_safe = p->pgrp;
221 if(p->session > last_pid && next_safe > p->session)
222 next_safe = p->session;
224 read_unlock(&tasklist_lock);
226 spin_unlock(&lastpid_lock);
228 return last_pid;
231 static inline int dup_mmap(struct mm_struct * mm)
233 struct vm_area_struct * mpnt, *tmp, **pprev;
234 int retval;
236 flush_cache_mm(current->mm);
237 pprev = &mm->mmap;
238 for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
239 struct file *file;
241 retval = -ENOMEM;
242 tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
243 if (!tmp)
244 goto fail_nomem;
245 *tmp = *mpnt;
246 tmp->vm_flags &= ~VM_LOCKED;
247 tmp->vm_mm = mm;
248 mm->map_count++;
249 tmp->vm_next = NULL;
250 file = tmp->vm_file;
251 if (file) {
252 file->f_count++;
253 if (tmp->vm_flags & VM_DENYWRITE)
254 file->f_dentry->d_inode->i_writecount--;
256 /* insert tmp into the share list, just after mpnt */
257 if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
258 mpnt->vm_next_share->vm_pprev_share =
259 &tmp->vm_next_share;
260 mpnt->vm_next_share = tmp;
261 tmp->vm_pprev_share = &mpnt->vm_next_share;
264 /* Copy the pages, but defer checking for errors */
265 retval = copy_page_range(mm, current->mm, tmp);
266 if (!retval && tmp->vm_ops && tmp->vm_ops->open)
267 tmp->vm_ops->open(tmp);
270 * Link in the new vma even if an error occurred,
271 * so that exit_mmap() can clean up the mess.
273 tmp->vm_next = *pprev;
274 *pprev = tmp;
276 pprev = &tmp->vm_next;
277 if (retval)
278 goto fail_nomem;
280 retval = 0;
281 if (mm->map_count >= AVL_MIN_MAP_COUNT)
282 build_mmap_avl(mm);
284 fail_nomem:
285 flush_tlb_mm(current->mm);
286 return retval;
290 * Allocate and initialize an mm_struct.
292 * NOTE! The mm mutex will be locked until the
293 * caller decides that all systems are go..
295 struct mm_struct * mm_alloc(void)
297 struct mm_struct * mm;
299 mm = kmem_cache_alloc(mm_cachep, SLAB_KERNEL);
300 if (mm) {
301 *mm = *current->mm;
302 init_new_context(mm);
303 atomic_set(&mm->count, 1);
304 mm->map_count = 0;
305 mm->def_flags = 0;
306 init_MUTEX_LOCKED(&mm->mmap_sem);
308 * Leave mm->pgd set to the parent's pgd
309 * so that pgd_offset() is always valid.
311 mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL;
313 /* It has not run yet, so cannot be present in anyone's
314 * cache or tlb.
316 mm->cpu_vm_mask = 0;
318 return mm;
321 /* Please note the differences between mmput and mm_release.
322 * mmput is called whenever we stop holding onto a mm_struct,
323 * error success whatever.
325 * mm_release is called after a mm_struct has been removed
326 * from the current process.
328 * This difference is important for error handling, when we
329 * only half set up a mm_struct for a new process and need to restore
330 * the old one. Because we mmput the new mm_struct before
331 * restoring the old one. . .
332 * Eric Biederman 10 January 1998
334 void mm_release(void)
336 struct task_struct *tsk = current;
337 forget_segments();
338 /* notify parent sleeping on vfork() */
339 if (tsk->flags & PF_VFORK) {
340 tsk->flags &= ~PF_VFORK;
341 up(tsk->p_opptr->vfork_sem);
346 * Decrement the use count and release all resources for an mm.
348 void mmput(struct mm_struct *mm)
350 if (atomic_dec_and_test(&mm->count)) {
351 release_segments(mm);
352 exit_mmap(mm);
353 free_page_tables(mm);
354 kmem_cache_free(mm_cachep, mm);
358 static inline int copy_mm(int nr, unsigned long clone_flags, struct task_struct * tsk)
360 struct mm_struct * mm;
361 int retval;
363 if (clone_flags & CLONE_VM) {
364 mmget(current->mm);
366 * Set up the LDT descriptor for the clone task.
368 copy_segments(nr, tsk, NULL);
369 SET_PAGE_DIR(tsk, current->mm->pgd);
370 return 0;
373 retval = -ENOMEM;
374 mm = mm_alloc();
375 if (!mm)
376 goto fail_nomem;
378 tsk->mm = mm;
379 tsk->min_flt = tsk->maj_flt = 0;
380 tsk->cmin_flt = tsk->cmaj_flt = 0;
381 tsk->nswap = tsk->cnswap = 0;
382 copy_segments(nr, tsk, mm);
383 retval = new_page_tables(tsk);
384 if (retval)
385 goto free_mm;
386 retval = dup_mmap(mm);
387 if (retval)
388 goto free_pt;
389 up(&mm->mmap_sem);
390 return 0;
392 free_mm:
393 tsk->mm = NULL;
394 release_segments(mm);
395 kmem_cache_free(mm_cachep, mm);
396 return retval;
397 free_pt:
398 tsk->mm = NULL;
399 mmput(mm);
400 fail_nomem:
401 return retval;
404 static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
406 if (clone_flags & CLONE_FS) {
407 atomic_inc(&current->fs->count);
408 return 0;
410 tsk->fs = kmalloc(sizeof(*tsk->fs), GFP_KERNEL);
411 if (!tsk->fs)
412 return -1;
413 atomic_set(&tsk->fs->count, 1);
414 tsk->fs->umask = current->fs->umask;
415 tsk->fs->root = dget(current->fs->root);
416 tsk->fs->pwd = dget(current->fs->pwd);
417 return 0;
421 * Copy a fd_set and compute the maximum fd it contains.
423 static inline int __copy_fdset(unsigned long *d, unsigned long *src)
425 int i;
426 unsigned long *p = src;
427 unsigned long *max = src;
429 for (i = __FDSET_LONGS; i; --i) {
430 if ((*d++ = *p++) != 0)
431 max = p;
433 return (max - src)*sizeof(long)*8;
436 static inline int copy_fdset(fd_set *dst, fd_set *src)
438 return __copy_fdset(dst->fds_bits, src->fds_bits);
441 static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
443 struct files_struct *oldf, *newf;
444 struct file **old_fds, **new_fds;
445 int size, i, error = 0;
448 * A background process may not have any files ...
450 oldf = current->files;
451 if (!oldf)
452 goto out;
454 if (clone_flags & CLONE_FILES) {
455 atomic_inc(&oldf->count);
456 goto out;
459 tsk->files = NULL;
460 error = -ENOMEM;
461 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
462 if (!newf)
463 goto out;
466 * Allocate the fd array, using get_free_page() if possible.
467 * Eventually we want to make the array size variable ...
469 size = NR_OPEN * sizeof(struct file *);
470 if (size == PAGE_SIZE)
471 new_fds = (struct file **) __get_free_page(GFP_KERNEL);
472 else
473 new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
474 if (!new_fds)
475 goto out_release;
477 atomic_set(&newf->count, 1);
478 newf->max_fds = NR_OPEN;
479 newf->fd = new_fds;
480 newf->close_on_exec = oldf->close_on_exec;
481 i = copy_fdset(&newf->open_fds, &oldf->open_fds);
483 old_fds = oldf->fd;
484 for (; i != 0; i--) {
485 struct file *f = *old_fds++;
486 *new_fds = f;
487 if (f)
488 f->f_count++;
489 new_fds++;
491 /* This is long word aligned thus could use a optimized version */
492 memset(new_fds, 0, (char *)newf->fd + size - (char *)new_fds);
494 tsk->files = newf;
495 error = 0;
496 out:
497 return error;
499 out_release:
500 kmem_cache_free(files_cachep, newf);
501 goto out;
504 static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
506 if (clone_flags & CLONE_SIGHAND) {
507 atomic_inc(&current->sig->count);
508 return 0;
510 tsk->sig = kmalloc(sizeof(*tsk->sig), GFP_KERNEL);
511 if (!tsk->sig)
512 return -1;
513 spin_lock_init(&tsk->sig->siglock);
514 atomic_set(&tsk->sig->count, 1);
515 memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
516 return 0;
519 static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
521 unsigned long new_flags = p->flags;
523 new_flags &= ~(PF_SUPERPRIV | PF_USEDFPU | PF_VFORK);
524 new_flags |= PF_FORKNOEXEC;
525 if (!(clone_flags & CLONE_PTRACE))
526 new_flags &= ~(PF_PTRACED|PF_TRACESYS);
527 if (clone_flags & CLONE_VFORK)
528 new_flags |= PF_VFORK;
529 p->flags = new_flags;
533 * Ok, this is the main fork-routine. It copies the system process
534 * information (task[nr]) and sets up the necessary registers. It
535 * also copies the data segment in its entirety.
537 int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs)
539 int nr;
540 int retval = -ENOMEM;
541 struct task_struct *p;
542 DECLARE_MUTEX_LOCKED(sem);
544 current->vfork_sem = &sem;
546 p = alloc_task_struct();
547 if (!p)
548 goto fork_out;
550 *p = *current;
552 down(&current->mm->mmap_sem);
553 lock_kernel();
555 retval = -EAGAIN;
556 if (p->user) {
557 if (atomic_read(&p->user->count) >= p->rlim[RLIMIT_NPROC].rlim_cur)
558 goto bad_fork_free;
562 struct task_struct **tslot;
563 tslot = find_empty_process();
564 if (!tslot)
565 goto bad_fork_free;
566 p->tarray_ptr = tslot;
567 *tslot = p;
568 nr = tslot - &task[0];
571 if (p->exec_domain && p->exec_domain->module)
572 __MOD_INC_USE_COUNT(p->exec_domain->module);
573 if (p->binfmt && p->binfmt->module)
574 __MOD_INC_USE_COUNT(p->binfmt->module);
576 p->did_exec = 0;
577 p->swappable = 0;
578 p->state = TASK_UNINTERRUPTIBLE;
580 copy_flags(clone_flags, p);
581 p->pid = get_pid(clone_flags);
584 * This is a "shadow run" state. The process
585 * is marked runnable, but isn't actually on
586 * any run queue yet.. (that happens at the
587 * very end).
589 p->state = TASK_RUNNING;
590 p->next_run = p;
591 p->prev_run = p;
593 p->p_pptr = p->p_opptr = current;
594 p->p_cptr = NULL;
595 init_waitqueue_head(&p->wait_chldexit);
596 p->vfork_sem = NULL;
598 p->sigpending = 0;
599 sigemptyset(&p->signal);
600 p->sigqueue = NULL;
601 p->sigqueue_tail = &p->sigqueue;
603 p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
604 p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
605 init_timer(&p->real_timer);
606 p->real_timer.data = (unsigned long) p;
608 p->leader = 0; /* session leadership doesn't inherit */
609 p->tty_old_pgrp = 0;
610 p->times.tms_utime = p->times.tms_stime = 0;
611 p->times.tms_cutime = p->times.tms_cstime = 0;
612 #ifdef __SMP__
614 int i;
615 p->has_cpu = 0;
616 p->processor = NO_PROC_ID;
617 /* ?? should we just memset this ?? */
618 for(i = 0; i < smp_num_cpus; i++)
619 p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
620 spin_lock_init(&p->sigmask_lock);
622 #endif
623 p->lock_depth = -1; /* -1 = no lock */
624 p->start_time = jiffies;
626 retval = -ENOMEM;
627 /* copy all the process information */
628 if (copy_files(clone_flags, p))
629 goto bad_fork_cleanup;
630 if (copy_fs(clone_flags, p))
631 goto bad_fork_cleanup_files;
632 if (copy_sighand(clone_flags, p))
633 goto bad_fork_cleanup_fs;
634 if (copy_mm(nr, clone_flags, p))
635 goto bad_fork_cleanup_sighand;
636 retval = copy_thread(nr, clone_flags, usp, p, regs);
637 if (retval)
638 goto bad_fork_cleanup_sighand;
639 p->semundo = NULL;
641 /* ok, now we should be set up.. */
642 p->swappable = 1;
643 p->exit_signal = clone_flags & CSIGNAL;
644 p->pdeath_signal = 0;
647 * "share" dynamic priority between parent and child, thus the
648 * total amount of dynamic priorities in the system doesnt change,
649 * more scheduling fairness. This is only important in the first
650 * timeslice, on the long run the scheduling behaviour is unchanged.
652 current->counter >>= 1;
653 p->counter = current->counter;
656 * Ok, add it to the run-queues and make it
657 * visible to the rest of the system.
659 * Let it rip!
661 retval = p->pid;
662 if (retval) {
663 write_lock_irq(&tasklist_lock);
664 SET_LINKS(p);
665 hash_pid(p);
666 write_unlock_irq(&tasklist_lock);
668 nr_tasks++;
669 if (p->user)
670 atomic_inc(&p->user->count);
672 p->next_run = NULL;
673 p->prev_run = NULL;
674 wake_up_process(p); /* do this last */
676 ++total_forks;
677 bad_fork:
678 unlock_kernel();
679 up(&current->mm->mmap_sem);
680 fork_out:
681 if ((clone_flags & CLONE_VFORK) && (retval > 0))
682 down(&sem);
683 return retval;
685 bad_fork_cleanup_sighand:
686 exit_sighand(p);
687 bad_fork_cleanup_fs:
688 exit_fs(p); /* blocking */
689 bad_fork_cleanup_files:
690 exit_files(p); /* blocking */
691 bad_fork_cleanup:
692 if (p->exec_domain && p->exec_domain->module)
693 __MOD_DEC_USE_COUNT(p->exec_domain->module);
694 if (p->binfmt && p->binfmt->module)
695 __MOD_DEC_USE_COUNT(p->binfmt->module);
697 add_free_taskslot(p->tarray_ptr);
698 bad_fork_free:
699 free_task_struct(p);
700 goto bad_fork;
703 void __init filescache_init(void)
705 files_cachep = kmem_cache_create("files_cache",
706 sizeof(struct files_struct),
708 SLAB_HWCACHE_ALIGN,
709 NULL, NULL);
710 if (!files_cachep)
711 panic("Cannot create files cache");