/* [davej-history.git] kernel/fork.c, as of "Import 2.1.116pre2" */

/*
 *  linux/kernel/fork.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 */

/*
 *  'fork.c' contains the help-routines for the 'fork' system call
 * (see also system_call.s).
 * Fork is rather simple, once you get the hang of it, but the memory
 * management can be a bitch. See 'mm/mm.c': 'copy_page_tables()'
 */

#include <linux/init.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/unistd.h>
#include <linux/ptrace.h>
#include <linux/malloc.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/module.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
#include <asm/uaccess.h>

int nr_tasks=1;
int nr_running=1;
unsigned long int total_forks=0;	/* Handle normal Linux uptimes. */
int last_pid=0;

/* SLAB cache for mm_struct's. */
kmem_cache_t *mm_cachep;

/* SLAB cache for files structs */
kmem_cache_t *files_cachep;

struct task_struct *pidhash[PIDHASH_SZ];

struct task_struct **tarray_freelist = NULL;
spinlock_t taskslot_lock = SPIN_LOCK_UNLOCKED;

/* UID task count cache, to prevent walking entire process list every
 * single fork() operation.
 */
#define UIDHASH_SZ	(PIDHASH_SZ >> 2)

static struct user_struct {
        struct user_struct *next, **pprev;
        unsigned int uid;
        int task_count;
} *uidhash[UIDHASH_SZ];

spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;

kmem_cache_t *uid_cachep;

#define uidhashfn(uid)	(((uid >> 8) ^ uid) & (UIDHASH_SZ - 1))

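/*
 * The uidhash helpers below keep each bucket as a doubly-linked chain
 * using the pprev idiom: pprev points at whichever pointer currently
 * holds the entry (the bucket head or the previous entry's next), so
 * removal never needs to rescan the chain.  All of them run under
 * uidhash_lock.
 */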
static inline void uid_hash_insert(struct user_struct *up, unsigned int hashent)
{
        spin_lock(&uidhash_lock);
        if((up->next = uidhash[hashent]) != NULL)
                uidhash[hashent]->pprev = &up->next;
        up->pprev = &uidhash[hashent];
        uidhash[hashent] = up;
        spin_unlock(&uidhash_lock);
}

static inline void uid_hash_remove(struct user_struct *up)
{
        spin_lock(&uidhash_lock);
        if(up->next)
                up->next->pprev = up->pprev;
        *up->pprev = up->next;
        spin_unlock(&uidhash_lock);
}

static inline struct user_struct *uid_find(unsigned short uid, unsigned int hashent)
{
        struct user_struct *up;

        spin_lock(&uidhash_lock);
        for(up = uidhash[hashent]; (up && up->uid != uid); up = up->next)
                ;
        spin_unlock(&uidhash_lock);
        return up;
}

void free_uid(struct task_struct *p)
{
        struct user_struct *up = p->user;

        if (up) {
                p->user = NULL;
                lock_kernel();
                if (!--up->task_count) {
                        uid_hash_remove(up);
                        kmem_cache_free(uid_cachep, up);
                }
                unlock_kernel();
        }
}

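/*
 * Charge a new task to its user: look the uid up in the hash, allocate
 * a user_struct on first use, and bump its task_count.  free_uid()
 * above is the inverse, dropping the count and freeing the entry when
 * the last task owned by that uid goes away.
 */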
int alloc_uid(struct task_struct *p)
{
        unsigned int hashent = uidhashfn(p->uid);
        struct user_struct *up = uid_find(p->uid, hashent);

        p->user = up;
        if (!up) {
                up = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
                if (!up)
                        return -EAGAIN;
                p->user = up;
                up->uid = p->uid;
                up->task_count = 0;
                uid_hash_insert(up, hashent);
        }

        up->task_count++;
        return 0;
}

__initfunc(void uidcache_init(void))
{
        int i;

        uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct),
                                       0,
                                       SLAB_HWCACHE_ALIGN, NULL, NULL);
        if(!uid_cachep)
                panic("Cannot create uid taskcount SLAB cache\n");

        for(i = 0; i < UIDHASH_SZ; i++)
                uidhash[i] = 0;
}

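/*
 * Find a free slot in the task array.  Ordinary users stop getting
 * slots once fewer than MIN_TASKS_LEFT_FOR_ROOT remain, so root can
 * still log in and clean up even when the table is nearly full.
 */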
static inline struct task_struct ** find_empty_process(void)
{
        struct task_struct **tslot = NULL;

        if (!current->uid || (nr_tasks < NR_TASKS - MIN_TASKS_LEFT_FOR_ROOT))
                tslot = get_free_taskslot();
        return tslot;
}

/* Protects next_safe and last_pid. */
spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;

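/*
 * Allocate a pid for the new task.  last_pid is bumped sequentially
 * and wraps back to 300 (lower pids are left to long-lived daemons)
 * once it would pass 0x7fff.  next_safe caches the nearest pid, pgrp
 * or session id above last_pid that is already taken, so the tasklist
 * only has to be rescanned when last_pid actually reaches it.
 */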
static int get_pid(unsigned long flags)
{
        static int next_safe = PID_MAX;
        struct task_struct *p;

        if (flags & CLONE_PID)
                return current->pid;

        spin_lock(&lastpid_lock);
        if((++last_pid) & 0xffff8000) {
                last_pid = 300;		/* Skip daemons etc. */
                goto inside;
        }
        if(last_pid >= next_safe) {
inside:
                next_safe = PID_MAX;
                read_lock(&tasklist_lock);
        repeat:
                for_each_task(p) {
                        if(p->pid == last_pid ||
                           p->pgrp == last_pid ||
                           p->session == last_pid) {
                                if(++last_pid >= next_safe) {
                                        if(last_pid & 0xffff8000)
                                                last_pid = 300;
                                        next_safe = PID_MAX;
                                        goto repeat;
                                }
                        }
                        if(p->pid > last_pid && next_safe > p->pid)
                                next_safe = p->pid;
                        if(p->pgrp > last_pid && next_safe > p->pgrp)
                                next_safe = p->pgrp;
                        if(p->session > last_pid && next_safe > p->session)
                                next_safe = p->session;
                }
                read_unlock(&tasklist_lock);
        }
        spin_unlock(&lastpid_lock);

        return last_pid;
}

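/*
 * Duplicate the parent's vma list into the new mm: every vma is
 * copied (minus VM_LOCKED), backing files gain a reference (with
 * i_writecount adjusted for VM_DENYWRITE mappings) and are linked
 * into the file's share list, and copy_page_range() copies the page
 * tables so the underlying pages are shared copy-on-write.
 */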
static inline int dup_mmap(struct mm_struct * mm)
{
        struct vm_area_struct * mpnt, *tmp, **pprev;
        int retval;

        flush_cache_mm(current->mm);
        pprev = &mm->mmap;
        for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
                struct file *file;

                retval = -ENOMEM;
                tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
                if (!tmp)
                        goto fail_nomem;
                *tmp = *mpnt;
                tmp->vm_flags &= ~VM_LOCKED;
                tmp->vm_mm = mm;
                mm->map_count++;
                tmp->vm_next = NULL;
                file = tmp->vm_file;
                if (file) {
                        file->f_count++;
                        if (tmp->vm_flags & VM_DENYWRITE)
                                file->f_dentry->d_inode->i_writecount--;

                        /* insert tmp into the share list, just after mpnt */
                        if((tmp->vm_next_share = mpnt->vm_next_share) != NULL)
                                mpnt->vm_next_share->vm_pprev_share =
                                        &tmp->vm_next_share;
                        mpnt->vm_next_share = tmp;
                        tmp->vm_pprev_share = &mpnt->vm_next_share;
                }

                /* Copy the pages, but defer checking for errors */
                retval = copy_page_range(mm, current->mm, tmp);
                if (!retval && tmp->vm_ops && tmp->vm_ops->open)
                        tmp->vm_ops->open(tmp);

                /*
                 * Link in the new vma even if an error occurred,
                 * so that exit_mmap() can clean up the mess.
                 */
                if((tmp->vm_next = *pprev) != NULL)
                        (*pprev)->vm_pprev = &tmp->vm_next;
                *pprev = tmp;
                tmp->vm_pprev = pprev;

                pprev = &tmp->vm_next;
                if (retval)
                        goto fail_nomem;
        }
        retval = 0;

fail_nomem:
        flush_tlb_mm(current->mm);
        return retval;
}

/*
 * Allocate and initialize an mm_struct.
 *
 * NOTE! The mm mutex will be locked until the
 * caller decides that all systems are go..
 */
struct mm_struct * mm_alloc(void)
{
        struct mm_struct * mm;

        mm = kmem_cache_alloc(mm_cachep, SLAB_KERNEL);
        if (mm) {
                *mm = *current->mm;
                init_new_context(mm);
                atomic_set(&mm->count, 1);
                mm->map_count = 0;
                mm->def_flags = 0;
                mm->mmap_sem = MUTEX_LOCKED;
                /*
                 * Leave mm->pgd set to the parent's pgd
                 * so that pgd_offset() is always valid.
                 */
                mm->mmap = mm->mmap_cache = NULL;

                /* It has not run yet, so cannot be present in anyone's
                 * cache or tlb.
                 */
                mm->cpu_vm_mask = 0;
        }
        return mm;
}

/*
 * Decrement the use count and release all resources for an mm.
 */
void mmput(struct mm_struct *mm)
{
        if (atomic_dec_and_test(&mm->count)) {
                release_segments(mm);
                exit_mmap(mm);
                free_page_tables(mm);
                kmem_cache_free(mm_cachep, mm);
        }
}

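/*
 * Give the child its mm.  With CLONE_VM the parent's mm is reused and
 * only its use count goes up; otherwise a fresh mm_struct, new page
 * tables and a duplicated vma list are built via mm_alloc(),
 * new_page_tables() and dup_mmap().
 */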
static inline int copy_mm(int nr, unsigned long clone_flags, struct task_struct * tsk)
{
        struct mm_struct * mm;
        int retval;

        if (clone_flags & CLONE_VM) {
                mmget(current->mm);
                /*
                 * Set up the LDT descriptor for the clone task.
                 */
                copy_segments(nr, tsk, NULL);
                SET_PAGE_DIR(tsk, current->mm->pgd);
                return 0;
        }

        retval = -ENOMEM;
        mm = mm_alloc();
        if (!mm)
                goto fail_nomem;

        tsk->mm = mm;
        tsk->min_flt = tsk->maj_flt = 0;
        tsk->cmin_flt = tsk->cmaj_flt = 0;
        tsk->nswap = tsk->cnswap = 0;
        copy_segments(nr, tsk, mm);
        retval = new_page_tables(tsk);
        if (retval)
                goto free_mm;
        retval = dup_mmap(mm);
        if (retval)
                goto free_pt;
        up(&mm->mmap_sem);
        return 0;

free_mm:
        mm->pgd = NULL;
free_pt:
        tsk->mm = NULL;
        mmput(mm);
fail_nomem:
        return retval;
}

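/*
 * fs_struct carries the umask, root and working directory.  With
 * CLONE_FS it is shared by bumping its count; otherwise a private
 * copy is made and root/pwd get their own dentry references.
 */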
static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk)
{
        if (clone_flags & CLONE_FS) {
                atomic_inc(&current->fs->count);
                return 0;
        }
        tsk->fs = kmalloc(sizeof(*tsk->fs), GFP_KERNEL);
        if (!tsk->fs)
                return -1;
        atomic_set(&tsk->fs->count, 1);
        tsk->fs->umask = current->fs->umask;
        tsk->fs->root = dget(current->fs->root);
        tsk->fs->pwd = dget(current->fs->pwd);
        return 0;
}

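/*
 * __copy_fdset() copies the open-fd bitmap and returns an index just
 * past the highest word that contains an open descriptor, i.e. an
 * upper bound rounded up to a whole word of bits, hence the accuracy
 * caveat below.
 */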
/* return value is only accurate by +-sizeof(long)*8 fds */
/* XXX make this architecture specific */
static inline int __copy_fdset(unsigned long *d, unsigned long *src)
{
        int i;
        unsigned long *p = src;
        unsigned long *max = src;

        for (i = __FDSET_LONGS; i; --i) {
                if ((*d++ = *p++) != 0)
                        max = p;
        }
        return (max - src)*sizeof(long)*8;
}

static inline int copy_fdset(fd_set *dst, fd_set *src)
{
        return __copy_fdset(dst->fds_bits, src->fds_bits);
}

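/*
 * Duplicate the descriptor table.  With CLONE_FILES the parent's
 * files_struct is shared; otherwise a new files_struct and fd array
 * are allocated, the close-on-exec and open-fds sets are copied, and
 * every open struct file gets its f_count bumped.
 */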
static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
{
        struct files_struct *oldf, *newf;
        struct file **old_fds, **new_fds;
        int size, i, error = 0;

        /*
         * A background process may not have any files ...
         */
        oldf = current->files;
        if (!oldf)
                goto out;

        if (clone_flags & CLONE_FILES) {
                atomic_inc(&oldf->count);
                goto out;
        }

        tsk->files = NULL;
        error = -ENOMEM;
        newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
        if (!newf)
                goto out;

        /*
         * Allocate the fd array, using get_free_page() if possible.
         * Eventually we want to make the array size variable ...
         */
        size = NR_OPEN * sizeof(struct file *);
        if (size == PAGE_SIZE)
                new_fds = (struct file **) __get_free_page(GFP_KERNEL);
        else
                new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
        if (!new_fds)
                goto out_release;
        memset((void *) new_fds, 0, size);

        atomic_set(&newf->count, 1);
        newf->max_fds = NR_OPEN;
        newf->fd = new_fds;
        newf->close_on_exec = oldf->close_on_exec;
        i = copy_fdset(&newf->open_fds, &oldf->open_fds);

        old_fds = oldf->fd;
        for (; i != 0; i--) {
                struct file * f = *old_fds;
                old_fds++;
                *new_fds = f;
                if (f)
                        f->f_count++;
                new_fds++;
        }
        tsk->files = newf;
        error = 0;
out:
        return error;

out_release:
        kmem_cache_free(files_cachep, newf);
        goto out;
}

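/*
 * Signal handlers: share the parent's signal table for CLONE_SIGHAND,
 * otherwise allocate a new one and copy the action array.
 */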
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
{
        if (clone_flags & CLONE_SIGHAND) {
                atomic_inc(&current->sig->count);
                return 0;
        }
        tsk->sig = kmalloc(sizeof(*tsk->sig), GFP_KERNEL);
        if (!tsk->sig)
                return -1;
        spin_lock_init(&tsk->sig->siglock);
        atomic_set(&tsk->sig->count, 1);
        memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action));
        return 0;
}

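/*
 * The child starts with the parent's flags, minus any record of
 * having used super-user privileges or exec'd, and it only remains
 * traced if CLONE_PTRACE was asked for.
 */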
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
{
        unsigned long new_flags = p->flags;

        new_flags &= ~PF_SUPERPRIV;
        new_flags |= PF_FORKNOEXEC;
        if (!(clone_flags & CLONE_PTRACE))
                new_flags &= ~(PF_PTRACED|PF_TRACESYS);
        p->flags = new_flags;
}

/*
 *  Ok, this is the main fork-routine. It copies the system process
 * information (task[nr]) and sets up the necessary registers. It
 * also copies the data segment in its entirety.
 */
int do_fork(unsigned long clone_flags, unsigned long usp, struct pt_regs *regs)
{
        int nr;
        int retval = -ENOMEM;
        struct task_struct *p;

        p = alloc_task_struct();
        if (!p)
                goto fork_out;

        *p = *current;

        down(&current->mm->mmap_sem);
        lock_kernel();

        if (p->user) {
                if (p->user->task_count >= p->rlim[RLIMIT_NPROC].rlim_cur)
                        goto bad_fork_free;
        }

        {
                struct task_struct **tslot;
                tslot = find_empty_process();
                retval = -EAGAIN;
                if (!tslot)
                        goto bad_fork_free;
                p->tarray_ptr = tslot;
                *tslot = p;
                nr = tslot - &task[0];
        }

        if (p->exec_domain && p->exec_domain->module)
                __MOD_INC_USE_COUNT(p->exec_domain->module);
        if (p->binfmt && p->binfmt->module)
                __MOD_INC_USE_COUNT(p->binfmt->module);

        p->did_exec = 0;
        p->swappable = 0;
        p->state = TASK_UNINTERRUPTIBLE;

        copy_flags(clone_flags, p);
        p->pid = get_pid(clone_flags);

        /*
         * This is a "shadow run" state. The process
         * is marked runnable, but isn't actually on
         * any run queue yet.. (that happens at the
         * very end).
         */
        p->state = TASK_RUNNING;
        p->next_run = p;
        p->prev_run = p;

        p->p_pptr = p->p_opptr = current;
        p->p_cptr = NULL;
        init_waitqueue(&p->wait_chldexit);

        p->sigpending = 0;
        sigemptyset(&p->signal);
        p->sigqueue = NULL;
        p->sigqueue_tail = &p->sigqueue;

        p->it_real_value = p->it_virt_value = p->it_prof_value = 0;
        p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0;
        init_timer(&p->real_timer);
        p->real_timer.data = (unsigned long) p;

        p->leader = 0;		/* session leadership doesn't inherit */
        p->tty_old_pgrp = 0;
        p->times.tms_utime = p->times.tms_stime = 0;
        p->times.tms_cutime = p->times.tms_cstime = 0;
#ifdef __SMP__
        {
                int i;
                p->has_cpu = 0;
                p->processor = NO_PROC_ID;
                /* ?? should we just memset this ?? */
                for(i = 0; i < smp_num_cpus; i++)
                        p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
                spin_lock_init(&p->sigmask_lock);
        }
#endif
        p->lock_depth = -1;		/* -1 = no lock */
        p->start_time = jiffies;

        {
                /* This makes it visible to the rest of the system */
                unsigned long flags;
                write_lock_irqsave(&tasklist_lock, flags);
                SET_LINKS(p);
                hash_pid(p);
                write_unlock_irqrestore(&tasklist_lock, flags);
        }

        nr_tasks++;
        if (p->user)
                p->user->task_count++;

        retval = -ENOMEM;
        /* copy all the process information */
        if (copy_files(clone_flags, p))
                goto bad_fork_cleanup;
        if (copy_fs(clone_flags, p))
                goto bad_fork_cleanup_files;
        if (copy_sighand(clone_flags, p))
                goto bad_fork_cleanup_fs;
        if (copy_mm(nr, clone_flags, p))
                goto bad_fork_cleanup_sighand;
        retval = copy_thread(nr, clone_flags, usp, p, regs);
        if (retval)
                goto bad_fork_cleanup_sighand;
        p->semundo = NULL;

        /* ok, now we should be set up.. */
        p->swappable = 1;
        p->exit_signal = clone_flags & CSIGNAL;
        p->pdeath_signal = 0;

        /*
         * "share" dynamic priority between parent and child, so the
         * total amount of dynamic priority in the system doesn't change,
         * for more scheduling fairness. This is only important in the
         * first timeslice; in the long run the scheduling behaviour is
         * unchanged.
         */
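        /*
         * E.g. a parent with 20 ticks left keeps 10 and the child
         * starts with 10; on an odd count the spare tick is simply
         * lost to the right shift.
         */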
        current->counter >>= 1;
        p->counter = current->counter;

        /* Ok, add it to the run-queues, let it rip! */
        retval = p->pid;
        if (retval) {
                p->next_run = NULL;
                p->prev_run = NULL;
                wake_up_process(p);		/* do this last */
        }
        ++total_forks;
bad_fork:
        up(&current->mm->mmap_sem);
        unlock_kernel();
fork_out:
        return retval;

bad_fork_cleanup_sighand:
        exit_sighand(p);
bad_fork_cleanup_fs:
        exit_fs(p); /* blocking */
bad_fork_cleanup_files:
        exit_files(p); /* blocking */
bad_fork_cleanup:
        if (p->exec_domain && p->exec_domain->module)
                __MOD_DEC_USE_COUNT(p->exec_domain->module);
        if (p->binfmt && p->binfmt->module)
                __MOD_DEC_USE_COUNT(p->binfmt->module);

        {
                unsigned long flags;
                write_lock_irqsave(&tasklist_lock, flags);
                unhash_pid(p);
                REMOVE_LINKS(p);
                write_unlock_irqrestore(&tasklist_lock, flags);
        }

        if (p->user)
                p->user->task_count--;	/* undo the charge taken above */
        nr_tasks--;
        add_free_taskslot(p->tarray_ptr);
bad_fork_free:
        free_task_struct(p);
        goto bad_fork;
}
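
/*
 * files_struct objects come from their own SLAB cache; the constructor
 * below zeroes each object as it is added to the cache.
 */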
static void files_ctor(void *fp, kmem_cache_t *cachep, unsigned long flags)
{
        struct files_struct *f = fp;

        memset(f, 0, sizeof(*f));
}

__initfunc(void filescache_init(void))
{
        files_cachep = kmem_cache_create("files_cache",
                                         sizeof(struct files_struct),
                                         0,
                                         SLAB_HWCACHE_ALIGN,
                                         files_ctor, NULL);
        if (!files_cachep)
                panic("Cannot create files cache");
}