/* davej-history.git: kernel/fork.c, as imported for 2.3.12pre9 */

/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 *  (see also system_call.s).
 *  Fork is rather simple, once you get the hang of it, but the memory
 *  management can be a bitch. See 'mm/mm.c': 'copy_page_tables()'
 */

#include <linux/malloc.h>
#include <linux/init.h>
#include <linux/unistd.h>
#include <linux/smp_lock.h>
#include <linux/module.h>
#include <linux/vmalloc.h>

#include <asm/pgtable.h>
#include <asm/mmu_context.h>
#include <asm/uaccess.h>

/* The idle threads do not count.. */
int nr_threads = 0;
int nr_running = 0;

int max_threads;
unsigned long total_forks = 0;  /* Handle normal Linux uptimes. */
int last_pid = 0;

/* SLAB cache for mm_struct's. */
kmem_cache_t *mm_cachep;

/* SLAB cache for files structs */
kmem_cache_t *files_cachep;

struct task_struct *pidhash[PIDHASH_SZ];

/* UID task count cache, to prevent walking the entire process list on
 * every single fork() operation.
 */
#define UIDHASH_SZ      (PIDHASH_SZ >> 2)

static struct user_struct {
        atomic_t count;                 /* number of tasks with this uid */
        struct user_struct *next, **pprev;
        unsigned int uid;
} *uidhash[UIDHASH_SZ];

spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;

kmem_cache_t *uid_cachep;

#define uidhashfn(uid)  (((uid >> 8) ^ uid) & (UIDHASH_SZ - 1))
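
/*
 * Illustrative example (not in the original source): the hash folds the
 * high uid bits into the low ones and masks to the table size.  If
 * PIDHASH_SZ is 1024 -- so UIDHASH_SZ is 256 -- uid 1000 hashes to
 *
 *      ((1000 >> 8) ^ 1000) & 255  ==  (3 ^ 1000) & 255  ==  1003 & 255  ==  235
 */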

/*
 * These routines must be called with the uidhash spinlock held!
 */
static inline void uid_hash_insert(struct user_struct *up, unsigned int hashent)
{
        if ((up->next = uidhash[hashent]) != NULL)
                uidhash[hashent]->pprev = &up->next;
        up->pprev = &uidhash[hashent];
        uidhash[hashent] = up;
}

static inline void uid_hash_remove(struct user_struct *up)
{
        if (up->next)
                up->next->pprev = up->pprev;
        *up->pprev = up->next;
}

static inline struct user_struct *uid_hash_find(unsigned short uid, unsigned int hashent)
{
        struct user_struct *up, *next;

        next = uidhash[hashent];
        for (;;) {
                up = next;
                if (next) {
                        next = up->next;
                        if (up->uid != uid)
                                continue;
                        atomic_inc(&up->count);
                }
                break;
        }
        return up;
}

/*
 * For SMP, we need to re-test the user struct counter
 * after having acquired the spinlock. This allows us to do
 * the common case (not freeing anything) without having
 * any locking.
 */
#ifdef __SMP__
  #define uid_hash_free(up)     (!atomic_read(&(up)->count))
#else
  #define uid_hash_free(up)     (1)
#endif

void free_uid(struct task_struct *p)
{
        struct user_struct *up = p->user;

        if (up) {
                p->user = NULL;
                if (atomic_dec_and_test(&up->count)) {
                        spin_lock(&uidhash_lock);
                        if (uid_hash_free(up)) {
                                uid_hash_remove(up);
                                kmem_cache_free(uid_cachep, up);
                        }
                        spin_unlock(&uidhash_lock);
                }
        }
}
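
/*
 * Illustrative interleaving (sketch, not from the original file) showing
 * why uid_hash_free() re-tests the counter under the spinlock on SMP:
 *
 *      CPU0: free_uid()                        CPU1: alloc_uid()
 *      atomic_dec_and_test() -> count = 0
 *                                              spin_lock(&uidhash_lock);
 *                                              uid_hash_find() -> count = 1
 *                                              spin_unlock(&uidhash_lock);
 *      spin_lock(&uidhash_lock);
 *      uid_hash_free() sees count != 0,
 *      so the entry is left alone
 *      spin_unlock(&uidhash_lock);
 *
 * On UP the kernel is not preemptive here, so nothing can resurrect the
 * entry between the decrement and the removal, and uid_hash_free() can
 * simply evaluate to 1.
 */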

int alloc_uid(struct task_struct *p)
{
        unsigned int hashent = uidhashfn(p->uid);
        struct user_struct *up;

        spin_lock(&uidhash_lock);
        up = uid_hash_find(p->uid, hashent);
        spin_unlock(&uidhash_lock);

        if (!up) {
                struct user_struct *new;

                new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
                if (!new)
                        return -EAGAIN;
                new->uid = p->uid;
                atomic_set(&new->count, 1);

                /*
                 * Before adding this, check whether we raced
                 * on adding the same user already..
                 */
                spin_lock(&uidhash_lock);
                up = uid_hash_find(p->uid, hashent);
                if (up) {
                        kmem_cache_free(uid_cachep, new);
                } else {
                        uid_hash_insert(new, hashent);
                        up = new;
                }
                spin_unlock(&uidhash_lock);
        }
        p->user = up;
        return 0;
}

void __init fork_init(unsigned long memsize)
{
        int i;

        uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
                                       0,
                                       SLAB_HWCACHE_ALIGN, NULL, NULL);
        if (!uid_cachep)
                panic("Cannot create uid taskcount SLAB cache\n");

        for (i = 0; i < UIDHASH_SZ; i++)
                uidhash[i] = 0;

        /*
         * The default maximum number of threads is set to a safe
         * value: the thread structures can take up at most half
         * of memory.
         */
        max_threads = memsize / THREAD_SIZE / 2;

        init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
        init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
}
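
/*
 * Worked example (illustrative; THREAD_SIZE is architecture dependent,
 * the 8KB used here is an assumption matching a two-page x86 task stack):
 * with memsize = 128MB,
 *
 *      max_threads = 128MB / 8KB / 2 = 8192
 *
 * and init_task's RLIMIT_NPROC soft and hard limits both become 4096.
 */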

/* Protects next_safe and last_pid. */
spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;

static int get_pid(unsigned long flags)
{
        static int next_safe = PID_MAX;
        struct task_struct *p;

        if (flags & CLONE_PID)
                return current->pid;

        spin_lock(&lastpid_lock);
        if ((++last_pid) & 0xffff8000) {
                last_pid = 300;         /* Skip daemons etc. */
                goto inside;
        }
        if (last_pid >= next_safe) {
inside:
                next_safe = PID_MAX;
                read_lock(&tasklist_lock);
        repeat:
                for_each_task(p) {
                        if (p->pid == last_pid ||
                            p->pgrp == last_pid ||
                            p->session == last_pid) {
                                if (++last_pid >= next_safe) {
                                        if (last_pid & 0xffff8000)
                                                last_pid = 300;
                                        next_safe = PID_MAX;
                                }
                                goto repeat;
                        }
                        if (p->pid > last_pid && next_safe > p->pid)
                                next_safe = p->pid;
                        if (p->pgrp > last_pid && next_safe > p->pgrp)
                                next_safe = p->pgrp;
                        if (p->session > last_pid && next_safe > p->session)
                                next_safe = p->session;
                }
                read_unlock(&tasklist_lock);
        }
        spin_unlock(&lastpid_lock);

        return last_pid;
}
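
/*
 * Notes on the allocator above (descriptive, assuming PID_MAX is 0x8000):
 *
 *  - the "& 0xffff8000" test fires once last_pid reaches 0x8000, at which
 *    point allocation wraps back to 300 so that low pids (early daemons,
 *    kernel threads) are not reused;
 *  - next_safe caches the smallest pid/pgrp/session in use that is greater
 *    than last_pid.  For example, if a scan finds 306 itself free and the
 *    smallest id in use above it is 400, next_safe becomes 400 and pids
 *    307..399 are handed out without walking the task list again.
 */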

static inline int dup_mmap(struct mm_struct * mm)
{
        struct vm_area_struct * mpnt, *tmp, **pprev;
        int retval;

        /* Kill me slowly. UGLY! FIXME! */
        memcpy(&mm->start_code, &current->mm->start_code, 15*sizeof(unsigned long));

        flush_cache_mm(current->mm);
        pprev = &mm->mmap;
        for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
                struct file *file;

                retval = -ENOMEM;
                tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
                if (!tmp)
                        goto fail_nomem;
                *tmp = *mpnt;
                tmp->vm_flags &= ~VM_LOCKED;
                tmp->vm_mm = mm;
                mm->map_count++;
                tmp->vm_next = NULL;
                file = tmp->vm_file;
                if (file) {
                        get_file(file);
                        if (tmp->vm_flags & VM_DENYWRITE)
                                atomic_dec(&file->f_dentry->d_inode->i_writecount);

                        /* insert tmp into the share list, just after mpnt */
                        spin_lock(&file->f_dentry->d_inode->i_shared_lock);
                        if ((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
                                mpnt->vm_next_share->vm_pprev_share =
                                        &tmp->vm_next_share;
                        mpnt->vm_next_share = tmp;
                        tmp->vm_pprev_share = &mpnt->vm_next_share;
                        spin_unlock(&file->f_dentry->d_inode->i_shared_lock);
                }

                /* Copy the pages, but defer checking for errors */
                retval = copy_page_range(mm, current->mm, tmp);
                if (!retval && tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                /*
                 * Link in the new vma even if an error occurred,
                 * so that exit_mmap() can clean up the mess.
                 */
                tmp->vm_next = *pprev;
                *pprev = tmp;

                pprev = &tmp->vm_next;
                if (retval)
                        goto fail_nomem;
        }
        retval = 0;
        if (mm->map_count >= AVL_MIN_MAP_COUNT)
                build_mmap_avl(mm);

fail_nomem:
        flush_tlb_mm(current->mm);
        return retval;
}

/*
 * Allocate and initialize an mm_struct.
 */
struct mm_struct * mm_alloc(void)
{
        struct mm_struct * mm;

        mm = kmem_cache_alloc(mm_cachep, SLAB_KERNEL);
        if (mm) {
                memset(mm, 0, sizeof(*mm));
                atomic_set(&mm->mm_users, 1);
                atomic_set(&mm->mm_count, 1);
                init_MUTEX(&mm->mmap_sem);
                mm->page_table_lock = SPIN_LOCK_UNLOCKED;
                mm->pgd = pgd_alloc();
                if (mm->pgd)
                        return mm;
                kmem_cache_free(mm_cachep, mm);
        }
        return NULL;
}
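
/*
 * Descriptive note (not from the original source): mm_users counts tasks
 * actually using this address space (e.g. CLONE_VM siblings), while
 * mm_count counts references to the mm_struct object itself, including
 * the lazy-TLB reference a kernel thread keeps on the mm it borrowed.
 * mmput() below drops mm_users and tears the mappings down when it hits
 * zero; mmdrop() drops mm_count and ends up in __mmdrop() to free the
 * page directory and the mm_struct.
 */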

/*
 * Called when the last reference to the mm
 * is dropped: either by a lazy thread or by
 * mmput. Free the page directory and the mm.
 */
inline void __mmdrop(struct mm_struct *mm)
{
        if (mm == &init_mm) BUG();
        pgd_free(mm->pgd);
        kmem_cache_free(mm_cachep, mm);
}

/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
        if (atomic_dec_and_test(&mm->mm_users)) {
                exit_mmap(mm);
                mmdrop(mm);
        }
}

/* Please note the differences between mmput and mm_release.
 * mmput is called whenever we stop holding onto a mm_struct,
 * error or success, it doesn't matter.
 *
 * mm_release is called after a mm_struct has been removed
 * from the current process.
 *
 * This difference is important for error handling, when we
 * only half set up a mm_struct for a new process and need to restore
 * the old one.  Because we mmput the new mm_struct before
 * restoring the old one. . .
 * Eric Biederman 10 January 1998
 */
void mm_release(void)
{
        struct task_struct *tsk = current;

        forget_segments();

        /* notify parent sleeping on vfork() */
        if (tsk->flags & PF_VFORK) {
                tsk->flags &= ~PF_VFORK;
                up(tsk->p_opptr->vfork_sem);
        }
}

static inline int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
{
        struct mm_struct * mm;
        int retval;

        tsk->min_flt = tsk->maj_flt = 0;
        tsk->cmin_flt = tsk->cmaj_flt = 0;
        tsk->nswap = tsk->cnswap = 0;

        tsk->mm = NULL;
        tsk->active_mm = NULL;

        /*
         * Are we cloning a kernel thread?
         *
         * We need to steal an active VM for that..
         */
        mm = current->mm;
        if (!mm)
                return 0;

        if (clone_flags & CLONE_VM) {
                atomic_inc(&mm->mm_users);
                goto good_mm;
        }

        retval = -ENOMEM;
        mm = mm_alloc();
        if (!mm)
                goto fail_nomem;

        tsk->mm = mm;
        tsk->active_mm = mm;

        /*
         * child gets a private LDT (if there was an LDT in the parent)
         */
        copy_segments(tsk, mm);

        down(&current->mm->mmap_sem);
        retval = dup_mmap(mm);
        up(&current->mm->mmap_sem);
        if (retval)
                goto free_pt;

good_mm:
        tsk->mm = mm;
        tsk->active_mm = mm;
        init_new_context(tsk, mm);
        return 0;

free_pt:
        mmput(mm);
fail_nomem:
        return retval;
}
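
/*
 * Summary of the cases above (descriptive comment, not from the original
 * source): if the parent is a kernel thread (no mm) the child gets no mm
 * either and will later run on a borrowed active_mm; with CLONE_VM the
 * child shares the parent's mm_struct and only mm_users is bumped; a
 * plain fork() allocates a fresh mm and duplicates every VMA with
 * copy-on-write page tables via dup_mmap().
 */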

static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
        if (clone_flags & CLONE_FS) {
                atomic_inc(&current->fs->count);
                return 0;
        }
        tsk->fs = kmalloc(sizeof(*tsk->fs), GFP_KERNEL);
        if (!tsk->fs)
                return -1;
        atomic_set(&tsk->fs->count, 1);
        tsk->fs->umask = current->fs->umask;
        tsk->fs->root = dget(current->fs->root);
        tsk->fs->pwd = dget(current->fs->pwd);
        return 0;
}

static int count_open_files(struct files_struct *files, int size)
{
        int i;

        /* Find the last open fd */
        for (i = size/(8*sizeof(long)); i > 0; ) {
                if (files->open_fds->fds_bits[--i])
                        break;
        }
        i = (i+1) * 8 * sizeof(long);
        return i;
}
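
/*
 * Worked example (illustrative, assuming 32-bit longs): with size == 1024
 * the loop starts at word 32 of the open-fd bitmap and scans downwards for
 * the highest non-zero word.  If the highest open fd is 40, word 1 (bits
 * 32..63) is the last non-zero word, so the function returns
 * (1 + 1) * 32 == 64 -- i.e. the count is rounded up to a whole bitmap word.
 */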

static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
        struct files_struct *oldf, *newf;
        struct file **old_fds, **new_fds;
        int open_files, nfds, size, i, error = 0;

        /*
         * A background process may not have any files ...
         */
        oldf = current->files;
        if (!oldf)
                goto out;

        if (clone_flags & CLONE_FILES) {
                atomic_inc(&oldf->count);
                goto out;
        }

        tsk->files = NULL;
        error = -ENOMEM;
        newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
        if (!newf)
                goto out;

        atomic_set(&newf->count, 1);

        newf->file_lock     = RW_LOCK_UNLOCKED;
        newf->next_fd       = 0;
        newf->max_fds       = NR_OPEN_DEFAULT;
        newf->max_fdset     = __FD_SETSIZE;
        newf->close_on_exec = &newf->close_on_exec_init;
        newf->open_fds      = &newf->open_fds_init;
        newf->fd            = &newf->fd_array[0];

        /* We don't yet have the oldf readlock, but even if the old
           fdset gets grown now, we'll only copy up to "size" fds */
        size = oldf->max_fdset;
        if (size > __FD_SETSIZE) {
                newf->max_fdset = 0;
                write_lock(&newf->file_lock);
                error = expand_fdset(newf, size);
                write_unlock(&newf->file_lock);
                if (error)
                        goto out_release;
        }
        read_lock(&oldf->file_lock);

        open_files = count_open_files(oldf, size);

        /*
         * Check whether we need to allocate a larger fd array.
         * Note: we're not a clone task, so the open count won't
         * change.
         */
        nfds = NR_OPEN_DEFAULT;
        if (open_files > nfds) {
                read_unlock(&oldf->file_lock);
                newf->max_fds = 0;
                write_lock(&newf->file_lock);
                error = expand_fd_array(newf, open_files);
                write_unlock(&newf->file_lock);
                if (error)
                        goto out_release;
                nfds = newf->max_fds;
                read_lock(&oldf->file_lock);
        }

        old_fds = oldf->fd;
        new_fds = newf->fd;

        memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8);
        memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8);

        for (i = open_files; i != 0; i--) {
                struct file *f = *old_fds++;
                if (f)
                        get_file(f);
                *new_fds++ = f;
        }
        read_unlock(&oldf->file_lock);

        /* compute the remainder to be cleared */
        size = (newf->max_fds - open_files) * sizeof(struct file *);

        /* This is long word aligned, thus could use an optimized version */
        memset(new_fds, 0, size);

        if (newf->max_fdset > open_files) {
                int left = (newf->max_fdset-open_files)/8;
                int start = open_files / (8 * sizeof(unsigned long));

                memset(&newf->open_fds->fds_bits[start], 0, left);
                memset(&newf->close_on_exec->fds_bits[start], 0, left);
        }

        tsk->files = newf;
        error = 0;
out:
        return error;

out_release:
        free_fdset (newf->close_on_exec, newf->max_fdset);
        free_fdset (newf->open_fds, newf->max_fdset);
        kmem_cache_free(files_cachep, newf);
        goto out;
}

static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
        if (clone_flags & CLONE_SIGHAND) {
                atomic_inc(&current->sig->count);
                return 0;
        }
        tsk->sig = kmalloc(sizeof(*tsk->sig), GFP_KERNEL);
        if (!tsk->sig)
                return -1;
        spin_lock_init(&tsk->sig->siglock);
        atomic_set(&tsk->sig->count, 1);
        memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
        return 0;
}

static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
        unsigned long new_flags = p->flags;

        new_flags &= ~(PF_SUPERPRIV | PF_USEDFPU | PF_VFORK);
        new_flags |= PF_FORKNOEXEC;
        if (!(clone_flags & CLONE_PTRACE))
                new_flags &= ~(PF_PTRACED|PF_TRACESYS);
        if (clone_flags & CLONE_VFORK)
                new_flags |= PF_VFORK;
        p->flags = new_flags;
}

/*
 *  Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It
 * also copies the data segment in its entirety.
 */
int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs)
{
        int retval = -ENOMEM;
        struct task_struct *p;
        DECLARE_MUTEX_LOCKED(sem);

        current->vfork_sem = &sem;

        p = alloc_task_struct();
        if (!p)
                goto fork_out;

        *p = *current;

        lock_kernel();

        retval = -EAGAIN;
        if (p->user) {
                if (atomic_read(&p->user->count) >= p->rlim[RLIMIT_NPROC].rlim_cur)
                        goto bad_fork_free;
                atomic_inc(&p->user->count);
        }

        /*
         * Counter atomicity is protected by
         * the kernel lock
         */
        if (nr_threads >= max_threads)
                goto bad_fork_cleanup_count;

        if (p->exec_domain && p->exec_domain->module)
                __MOD_INC_USE_COUNT(p->exec_domain->module);
        if (p->binfmt && p->binfmt->module)
                __MOD_INC_USE_COUNT(p->binfmt->module);

        p->did_exec = 0;
        p->swappable = 0;
        p->state = TASK_UNINTERRUPTIBLE;

        copy_flags(clone_flags, p);
        p->pid = get_pid(clone_flags);

        /*
         * This is a "shadow run" state. The process
         * is marked runnable, but isn't actually on
         * any run queue yet.. (that happens at the
         * very end).
         */
        p->state = TASK_RUNNING;
        p->run_list.next = NULL;
        p->run_list.prev = NULL;

        p->p_pptr = p->p_opptr = current;
        p->p_cptr = NULL;
        init_waitqueue_head(&p->wait_chldexit);
        p->vfork_sem = NULL;

        p->sigpending = 0;
        sigemptyset(&p->signal);
        p->sigqueue = NULL;
        p->sigqueue_tail = &p->sigqueue;

        p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
        p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
        init_timer(&p->real_timer);
        p->real_timer.data = (unsigned long) p;

        p->leader = 0;          /* session leadership doesn't inherit */
        p->tty_old_pgrp = 0;
        p->times.tms_utime = p->times.tms_stime = 0;
        p->times.tms_cutime = p->times.tms_cstime = 0;
#ifdef __SMP__
        {
                int i;
                p->has_cpu = 0;
                p->processor = current->processor;
                /* ?? should we just memset this ?? */
                for (i = 0; i < smp_num_cpus; i++)
                        p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
                spin_lock_init(&p->sigmask_lock);
        }
#endif
        p->lock_depth = -1;             /* -1 = no lock */
        p->start_time = jiffies;

        retval = -ENOMEM;
        /* copy all the process information */
        if (copy_files(clone_flags, p))
                goto bad_fork_cleanup;
        if (copy_fs(clone_flags, p))
                goto bad_fork_cleanup_files;
        if (copy_sighand(clone_flags, p))
                goto bad_fork_cleanup_fs;
        if (copy_mm(clone_flags, p))
                goto bad_fork_cleanup_sighand;
        retval = copy_thread(0, clone_flags, usp, p, regs);
        if (retval)
                goto bad_fork_cleanup_sighand;
        p->semundo = NULL;

        /* ok, now we should be set up.. */
        p->swappable = 1;
        p->exit_signal = clone_flags & CSIGNAL;
        p->pdeath_signal = 0;

        /*
         * "share" the dynamic priority between parent and child, so the
         * total amount of dynamic priority in the system doesn't change,
         * giving more scheduling fairness. This only matters in the first
         * timeslice; in the long run the scheduling behaviour is unchanged.
         */
        current->counter >>= 1;
        p->counter = current->counter;

        /*
         * Ok, add it to the run-queues and make it
         * visible to the rest of the system.
         *
         * Let it rip!
         */
        retval = p->pid;
        write_lock_irq(&tasklist_lock);
        SET_LINKS(p);
        hash_pid(p);
        write_unlock_irq(&tasklist_lock);

        nr_threads++;
        wake_up_process(p);             /* do this last */
        ++total_forks;

bad_fork:
        unlock_kernel();
fork_out:
        if ((clone_flags & CLONE_VFORK) && (retval > 0))
                down(&sem);
        return retval;

bad_fork_cleanup_sighand:
        exit_sighand(p);
bad_fork_cleanup_fs:
        exit_fs(p); /* blocking */
bad_fork_cleanup_files:
        exit_files(p); /* blocking */
bad_fork_cleanup:
        if (p->exec_domain && p->exec_domain->module)
                __MOD_DEC_USE_COUNT(p->exec_domain->module);
        if (p->binfmt && p->binfmt->module)
                __MOD_DEC_USE_COUNT(p->binfmt->module);

        nr_threads--;
bad_fork_cleanup_count:
        if (p->user)
                free_uid(p);
bad_fork_free:
        free_task_struct(p);
        goto bad_fork;
}
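
/*
 * Usage sketch (illustrative, not part of this file): do_fork() is invoked
 * from the architecture's fork/vfork/clone entry points.  On i386 at about
 * this time the wrappers in arch/i386/kernel/process.c looked roughly like
 * the following -- treat the exact register layout as an assumption:
 *
 *      asmlinkage int sys_fork(struct pt_regs regs)
 *      {
 *              return do_fork(SIGCHLD, regs.esp, &regs);
 *      }
 *
 *      asmlinkage int sys_clone(struct pt_regs regs)
 *      {
 *              unsigned long clone_flags = regs.ebx;
 *              unsigned long newsp = regs.ecx;
 *
 *              if (!newsp)
 *                      newsp = regs.esp;
 *              return do_fork(clone_flags, newsp, &regs);
 *      }
 *
 *      asmlinkage int sys_vfork(struct pt_regs regs)
 *      {
 *              return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.esp, &regs);
 *      }
 */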

void __init filescache_init(void)
{
        files_cachep = kmem_cache_create("files_cache",
                                         sizeof(struct files_struct),
                                         0,
                                         SLAB_HWCACHE_ALIGN,
                                         NULL, NULL);
        if (!files_cachep)
                panic("Cannot create files cache");
}